Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/AArch64/AArch64.h6
-rw-r--r--lib/Target/AArch64/AArch64.td80
-rw-r--r--lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp12
-rw-r--r--lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp16
-rw-r--r--lib/Target/AArch64/AArch64AsmPrinter.cpp280
-rw-r--r--lib/Target/AArch64/AArch64CallLowering.cpp632
-rw-r--r--lib/Target/AArch64/AArch64CallLowering.h29
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.cpp38
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.h3
-rw-r--r--lib/Target/AArch64/AArch64CallingConvention.td88
-rw-r--r--lib/Target/AArch64/AArch64CollectLOH.cpp22
-rw-r--r--lib/Target/AArch64/AArch64Combine.td18
-rw-r--r--lib/Target/AArch64/AArch64CondBrTuning.cpp4
-rw-r--r--lib/Target/AArch64/AArch64ConditionalCompares.cpp6
-rw-r--r--lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp4
-rw-r--r--lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp76
-rw-r--r--lib/Target/AArch64/AArch64FalkorHWPFFix.cpp2
-rw-r--r--lib/Target/AArch64/AArch64FastISel.cpp75
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp301
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.h28
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp45
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp535
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h37
-rw-r--r--lib/Target/AArch64/AArch64InstrAtomics.td65
-rw-r--r--lib/Target/AArch64/AArch64InstrFormats.td220
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp1054
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.h12
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td253
-rw-r--r--lib/Target/AArch64/AArch64InstructionSelector.cpp1096
-rw-r--r--lib/Target/AArch64/AArch64LegalizerInfo.cpp111
-rw-r--r--lib/Target/AArch64/AArch64LegalizerInfo.h3
-rw-r--r--lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp160
-rw-r--r--lib/Target/AArch64/AArch64MCInstLower.cpp2
-rw-r--r--lib/Target/AArch64/AArch64MachineFunctionInfo.h17
-rw-r--r--lib/Target/AArch64/AArch64PBQPRegAlloc.cpp16
-rw-r--r--lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp98
-rw-r--r--lib/Target/AArch64/AArch64RegisterBankInfo.cpp39
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.cpp69
-rw-r--r--lib/Target/AArch64/AArch64SIMDInstrOpt.cpp8
-rw-r--r--lib/Target/AArch64/AArch64SVEInstrInfo.td264
-rw-r--r--lib/Target/AArch64/AArch64SelectionDAGInfo.cpp2
-rw-r--r--lib/Target/AArch64/AArch64SpeculationHardening.cpp13
-rw-r--r--lib/Target/AArch64/AArch64StackOffset.h138
-rw-r--r--lib/Target/AArch64/AArch64StackTagging.cpp394
-rw-r--r--lib/Target/AArch64/AArch64StackTaggingPreRA.cpp209
-rw-r--r--lib/Target/AArch64/AArch64StorePairSuppress.cpp2
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp50
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h48
-rw-r--r--lib/Target/AArch64/AArch64SystemOperands.td40
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp35
-rw-r--r--lib/Target/AArch64/AArch64TargetObjectFile.cpp4
-rw-r--r--lib/Target/AArch64/AArch64TargetObjectFile.h3
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.cpp29
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.h14
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp123
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp13
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp22
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp3
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp5
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h20
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp2
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp4
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp2
-rw-r--r--lib/Target/AArch64/SVEInstrFormats.td366
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.cpp2
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h25
-rw-r--r--lib/Target/AMDGPU/AMDGPU.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td16
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp21
-rw-r--r--lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp78
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.h10
-rw-r--r--lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp345
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.cpp701
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.h29
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallingConv.td27
-rw-r--r--lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp12
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.cpp6
-rw-r--r--lib/Target/AMDGPU/AMDGPUFrameLowering.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUGISel.td78
-rw-r--r--lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def80
-rw-r--r--lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp9
-rw-r--r--lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp272
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp250
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h18
-rw-r--r--lib/Target/AMDGPU/AMDGPUInline.cpp2
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td126
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp1144
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructionSelector.h48
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td216
-rw-r--r--lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp1132
-rw-r--r--lib/Target/AMDGPU/AMDGPULegalizerInfo.h52
-rw-r--r--lib/Target/AMDGPU/AMDGPULibCalls.cpp37
-rw-r--r--lib/Target/AMDGPU/AMDGPULibFunc.cpp14
-rw-r--r--lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp20
-rw-r--r--lib/Target/AMDGPU/AMDGPUMCInstLower.cpp4
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp38
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.cpp1
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.h6
-rw-r--r--lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp592
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp2
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp1234
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBankInfo.h56
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBanks.td6
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.cpp66
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPUSearchableTables.td4
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp64
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h67
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp29
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp90
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h20
-rw-r--r--lib/Target/AMDGPU/AMDILCFGStructurizer.cpp8
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp595
-rw-r--r--lib/Target/AMDGPU/BUFInstructions.td694
-rw-r--r--lib/Target/AMDGPU/DSInstructions.td92
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp4
-rw-r--r--lib/Target/AMDGPU/EvergreenInstructions.td60
-rw-r--r--lib/Target/AMDGPU/FLATInstructions.td196
-rw-r--r--lib/Target/AMDGPU/GCNDPPCombine.cpp88
-rw-r--r--lib/Target/AMDGPU/GCNHazardRecognizer.cpp21
-rw-r--r--lib/Target/AMDGPU/GCNILPSched.cpp1
-rw-r--r--lib/Target/AMDGPU/GCNIterativeScheduler.cpp2
-rw-r--r--lib/Target/AMDGPU/GCNNSAReassign.cpp8
-rw-r--r--lib/Target/AMDGPU/GCNRegBankReassign.cpp14
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.cpp26
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.h2
-rw-r--r--lib/Target/AMDGPU/GCNSchedStrategy.cpp31
-rw-r--r--lib/Target/AMDGPU/GCNSchedStrategy.h3
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp2
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp2
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp37
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h6
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp4
-rw-r--r--lib/Target/AMDGPU/MIMGInstructions.td4
-rw-r--r--lib/Target/AMDGPU/R600AsmPrinter.cpp2
-rw-r--r--lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp4
-rw-r--r--lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp22
-rw-r--r--lib/Target/AMDGPU/R600FrameLowering.h6
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp7
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.cpp22
-rw-r--r--lib/Target/AMDGPU/R600MachineScheduler.cpp8
-rw-r--r--lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp12
-rw-r--r--lib/Target/AMDGPU/R600Packetizer.cpp4
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.cpp2
-rw-r--r--lib/Target/AMDGPU/SIAddIMGInit.cpp4
-rw-r--r--lib/Target/AMDGPU/SIDefines.h6
-rw-r--r--lib/Target/AMDGPU/SIFixSGPRCopies.cpp394
-rw-r--r--lib/Target/AMDGPU/SIFixupVectorISel.cpp3
-rw-r--r--lib/Target/AMDGPU/SIFoldOperands.cpp114
-rw-r--r--lib/Target/AMDGPU/SIFormMemoryClauses.cpp22
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.cpp34
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.h6
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp1052
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h54
-rw-r--r--lib/Target/AMDGPU/SIInsertWaitcnts.cpp6
-rw-r--r--lib/Target/AMDGPU/SIInstrFormats.td5
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp558
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.h44
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td320
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td297
-rw-r--r--lib/Target/AMDGPU/SILoadStoreOptimizer.cpp1117
-rw-r--r--lib/Target/AMDGPU/SILowerControlFlow.cpp60
-rw-r--r--lib/Target/AMDGPU/SILowerI1Copies.cpp49
-rw-r--r--lib/Target/AMDGPU/SILowerSGPRSpills.cpp4
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp9
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.h11
-rw-r--r--lib/Target/AMDGPU/SIMachineScheduler.cpp16
-rw-r--r--lib/Target/AMDGPU/SIMemoryLegalizer.cpp6
-rw-r--r--lib/Target/AMDGPU/SIModeRegister.cpp2
-rw-r--r--lib/Target/AMDGPU/SIOptimizeExecMasking.cpp2
-rw-r--r--lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp32
-rw-r--r--lib/Target/AMDGPU/SIPeepholeSDWA.cpp32
-rw-r--r--lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp12
-rw-r--r--lib/Target/AMDGPU/SIProgramInfo.h5
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp445
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.h15
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.td464
-rw-r--r--lib/Target/AMDGPU/SIShrinkInstructions.cpp42
-rw-r--r--lib/Target/AMDGPU/SIWholeQuadMode.cpp35
-rw-r--r--lib/Target/AMDGPU/SMInstructions.td15
-rw-r--r--lib/Target/AMDGPU/SOPInstructions.td42
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp71
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h24
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp2
-rw-r--r--lib/Target/AMDGPU/VOP1Instructions.td65
-rw-r--r--lib/Target/AMDGPU/VOP2Instructions.td346
-rw-r--r--lib/Target/AMDGPU/VOP3Instructions.td69
-rw-r--r--lib/Target/AMDGPU/VOP3PInstructions.td2
-rw-r--r--lib/Target/AMDGPU/VOPCInstructions.td30
-rw-r--r--lib/Target/AMDGPU/VOPInstructions.td12
-rw-r--r--lib/Target/ARC/ARCFrameLowering.h4
-rw-r--r--lib/Target/ARC/ARCISelLowering.cpp2
-rw-r--r--lib/Target/ARC/ARCMachineFunctionInfo.h4
-rw-r--r--lib/Target/ARC/ARCOptAddrMode.cpp16
-rw-r--r--lib/Target/ARC/ARCRegisterInfo.cpp2
-rw-r--r--lib/Target/ARC/ARCTargetMachine.cpp2
-rw-r--r--lib/Target/ARM/A15SDOptimizer.cpp54
-rw-r--r--lib/Target/ARM/ARM.h2
-rw-r--r--lib/Target/ARM/ARM.td42
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp66
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp216
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h21
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp49
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h5
-rw-r--r--lib/Target/ARM/ARMBasicBlockInfo.cpp16
-rw-r--r--lib/Target/ARM/ARMBasicBlockInfo.h31
-rw-r--r--lib/Target/ARM/ARMCallLowering.cpp54
-rw-r--r--lib/Target/ARM/ARMCallLowering.h5
-rw-r--r--lib/Target/ARM/ARMCallingConv.cpp2
-rw-r--r--lib/Target/ARM/ARMCodeGenPrepare.cpp88
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp289
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.cpp1
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp96
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp88
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp65
-rw-r--r--lib/Target/ARM/ARMFrameLowering.h5
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp224
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp2073
-rw-r--r--lib/Target/ARM/ARMISelLowering.h45
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td23
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td127
-rw-r--r--lib/Target/ARM/ARMInstrMVE.td1430
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td191
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td16
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td98
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td96
-rw-r--r--lib/Target/ARM/ARMInstructionSelector.cpp41
-rw-r--r--lib/Target/ARM/ARMLegalizerInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp32
-rw-r--r--lib/Target/ARM/ARMLowOverheadLoops.cpp364
-rw-r--r--lib/Target/ARM/ARMMCInstLower.cpp4
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h8
-rw-r--r--lib/Target/ARM/ARMParallelDSP.cpp675
-rw-r--r--lib/Target/ARM/ARMPredicates.td2
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td18
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td4
-rw-r--r--lib/Target/ARM/ARMScheduleM4.td24
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp14
-rw-r--r--lib/Target/ARM/ARMSubtarget.h30
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp13
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp370
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.h24
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp251
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp31
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h20
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp14
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h3
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp6
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp12
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h5
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp21
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp6
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp4
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp10
-rw-r--r--lib/Target/ARM/MLxExpansionPass.cpp42
-rw-r--r--lib/Target/ARM/MVETailPredication.cpp519
-rw-r--r--lib/Target/ARM/MVEVPTBlockPass.cpp278
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp8
-rw-r--r--lib/Target/ARM/Thumb1InstrInfo.cpp17
-rw-r--r--lib/Target/ARM/Thumb2ITBlockPass.cpp134
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.cpp38
-rw-r--r--lib/Target/ARM/Thumb2SizeReduction.cpp28
-rw-r--r--lib/Target/ARM/ThumbRegisterInfo.cpp11
-rw-r--r--lib/Target/AVR/AVRAsmPrinter.cpp2
-rw-r--r--lib/Target/AVR/AVRExpandPseudoInsts.cpp12
-rw-r--r--lib/Target/AVR/AVRFrameLowering.cpp5
-rw-r--r--lib/Target/AVR/AVRISelDAGToDAG.cpp2
-rw-r--r--lib/Target/AVR/AVRISelLowering.cpp27
-rw-r--r--lib/Target/AVR/AVRISelLowering.h4
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.cpp2
-rw-r--r--lib/Target/AVR/AVRTargetMachine.cpp2
-rw-r--r--lib/Target/AVR/AsmParser/AVRAsmParser.cpp8
-rw-r--r--lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp2
-rw-r--r--lib/Target/BPF/AsmParser/BPFAsmParser.cpp6
-rw-r--r--lib/Target/BPF/BPF.h4
-rw-r--r--lib/Target/BPF/BPFAbstractMemberAccess.cpp708
-rw-r--r--lib/Target/BPF/BPFAsmPrinter.cpp2
-rw-r--r--lib/Target/BPF/BPFCORE.h14
-rw-r--r--lib/Target/BPF/BPFFrameLowering.h2
-rw-r--r--lib/Target/BPF/BPFISelDAGToDAG.cpp170
-rw-r--r--lib/Target/BPF/BPFISelLowering.cpp21
-rw-r--r--lib/Target/BPF/BPFInstrInfo.cpp6
-rw-r--r--lib/Target/BPF/BPFInstrInfo.td2
-rw-r--r--lib/Target/BPF/BPFMIChecking.cpp1
-rw-r--r--lib/Target/BPF/BPFMIPeephole.cpp206
-rw-r--r--lib/Target/BPF/BPFMISimplifyPatchable.cpp27
-rw-r--r--lib/Target/BPF/BPFRegisterInfo.cpp6
-rw-r--r--lib/Target/BPF/BPFTargetMachine.cpp16
-rw-r--r--lib/Target/BPF/BTF.h54
-rw-r--r--lib/Target/BPF/BTFDebug.cpp281
-rw-r--r--lib/Target/BPF/BTFDebug.h29
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp4
-rw-r--r--lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp2
-rw-r--r--lib/Target/Hexagon/BitTracker.cpp21
-rw-r--r--lib/Target/Hexagon/HexagonAsmPrinter.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonBitSimplify.cpp71
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.cpp8
-rw-r--r--lib/Target/Hexagon/HexagonBlockRanges.cpp14
-rw-r--r--lib/Target/Hexagon/HexagonBranchRelaxation.cpp5
-rw-r--r--lib/Target/Hexagon/HexagonConstExtenders.cpp17
-rw-r--r--lib/Target/Hexagon/HexagonConstPropagation.cpp32
-rw-r--r--lib/Target/Hexagon/HexagonCopyToCombine.cpp32
-rw-r--r--lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td696
-rw-r--r--lib/Target/Hexagon/HexagonDepOperands.td83
-rw-r--r--lib/Target/Hexagon/HexagonEarlyIfConv.cpp24
-rw-r--r--lib/Target/Hexagon/HexagonExpandCondsets.cpp30
-rw-r--r--lib/Target/Hexagon/HexagonFixupHwLoops.cpp5
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.cpp58
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.h2
-rw-r--r--lib/Target/Hexagon/HexagonGenExtract.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonGenInsert.cpp27
-rw-r--r--lib/Target/Hexagon/HexagonGenMux.cpp6
-rw-r--r--lib/Target/Hexagon/HexagonGenPredicate.cpp14
-rw-r--r--lib/Target/Hexagon/HexagonHardwareLoops.cpp56
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp156
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.h15
-rw-r--r--lib/Target/Hexagon/HexagonISelLoweringHVX.cpp24
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp273
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h22
-rw-r--r--lib/Target/Hexagon/HexagonIntrinsics.td46
-rw-r--r--lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp15
-rw-r--r--lib/Target/Hexagon/HexagonNewValueJump.cpp10
-rw-r--r--lib/Target/Hexagon/HexagonOptAddrMode.cpp12
-rw-r--r--lib/Target/Hexagon/HexagonPatterns.td194
-rw-r--r--lib/Target/Hexagon/HexagonPatternsHVX.td40
-rw-r--r--lib/Target/Hexagon/HexagonPeephole.cpp38
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp6
-rw-r--r--lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp8
-rw-r--r--lib/Target/Hexagon/HexagonSplitDouble.cpp60
-rw-r--r--lib/Target/Hexagon/HexagonStoreWidening.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.cpp19
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.h2
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp12
-rw-r--r--lib/Target/Hexagon/HexagonTargetTransformInfo.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonTargetTransformInfo.h4
-rw-r--r--lib/Target/Hexagon/HexagonVExtract.cpp12
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.cpp37
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.h3
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp6
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp4
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp7
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp4
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp10
-rw-r--r--lib/Target/Hexagon/RDFCopy.cpp4
-rw-r--r--lib/Target/Hexagon/RDFDeadCode.cpp1
-rw-r--r--lib/Target/Hexagon/RDFGraph.cpp16
-rw-r--r--lib/Target/Hexagon/RDFLiveness.cpp8
-rw-r--r--lib/Target/Hexagon/RDFRegisters.cpp8
-rw-r--r--lib/Target/Hexagon/RDFRegisters.h8
-rw-r--r--lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp23
-rw-r--r--lib/Target/Lanai/LanaiAsmPrinter.cpp2
-rw-r--r--lib/Target/Lanai/LanaiDelaySlotFiller.cpp2
-rw-r--r--lib/Target/Lanai/LanaiFrameLowering.cpp4
-rw-r--r--lib/Target/Lanai/LanaiFrameLowering.h2
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.cpp15
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.h4
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.cpp9
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.h3
-rw-r--r--lib/Target/Lanai/LanaiRegisterInfo.cpp2
-rw-r--r--lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp2
-rw-r--r--lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp12
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp4
-rw-r--r--lib/Target/MSP430/MSP430AsmPrinter.cpp8
-rw-r--r--lib/Target/MSP430/MSP430BranchSelector.cpp1
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.h3
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp27
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.h2
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.cpp2
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.cpp2
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp696
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp16
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp1
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h6
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp4
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp7
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h5
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp11
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp2
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp4
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp96
-rw-r--r--lib/Target/Mips/MicroMipsDSPInstrInfo.td4
-rw-r--r--lib/Target/Mips/MicroMipsInstrInfo.td9
-rw-r--r--lib/Target/Mips/MicroMipsSizeReduction.cpp18
-rw-r--r--lib/Target/Mips/Mips.td12
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.cpp2
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.cpp16
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.cpp2
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td36
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.cpp12
-rw-r--r--lib/Target/Mips/MipsCallLowering.cpp150
-rw-r--r--lib/Target/Mips/MipsCallLowering.h8
-rw-r--r--lib/Target/Mips/MipsConstantIslandPass.cpp63
-rw-r--r--lib/Target/Mips/MipsDSPInstrInfo.td19
-rw-r--r--lib/Target/Mips/MipsExpandPseudo.cpp54
-rw-r--r--lib/Target/Mips/MipsFastISel.cpp12
-rw-r--r--lib/Target/Mips/MipsFrameLowering.h5
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp53
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.h5
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp164
-rw-r--r--lib/Target/Mips/MipsISelLowering.h13
-rw-r--r--lib/Target/Mips/MipsInstrInfo.cpp3
-rw-r--r--lib/Target/Mips/MipsInstrInfo.h2
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td30
-rw-r--r--lib/Target/Mips/MipsInstructionSelector.cpp206
-rw-r--r--lib/Target/Mips/MipsLegalizerInfo.cpp244
-rw-r--r--lib/Target/Mips/MipsLegalizerInfo.h3
-rw-r--r--lib/Target/Mips/MipsMSAInstrInfo.td55
-rw-r--r--lib/Target/Mips/MipsOptimizePICCall.cpp5
-rw-r--r--lib/Target/Mips/MipsPfmCounters.td18
-rw-r--r--lib/Target/Mips/MipsPreLegalizerCombiner.cpp3
-rw-r--r--lib/Target/Mips/MipsRegisterBankInfo.cpp322
-rw-r--r--lib/Target/Mips/MipsRegisterBankInfo.h9
-rw-r--r--lib/Target/Mips/MipsRegisterBanks.td2
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp55
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp54
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.h6
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp124
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.cpp20
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp8
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp17
-rw-r--r--lib/Target/Mips/MipsSubtarget.h15
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp18
-rw-r--r--lib/Target/Mips/MipsTargetStreamer.h14
-rw-r--r--lib/Target/NVPTX/NVPTX.h2
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp34
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.h2
-rw-r--r--lib/Target/NVPTX/NVPTXFrameLowering.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp58
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.td13
-rw-r--r--lib/Target/NVPTX/NVPTXIntrinsics.td169
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAlloca.cpp97
-rw-r--r--lib/Target/NVPTX/NVPTXLowerArgs.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXPeephole.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXUtilities.cpp13
-rw-r--r--lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp10
-rw-r--r--lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp6
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp6
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp25
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp1
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp4
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h14
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp4
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp2
-rw-r--r--lib/Target/PowerPC/P9InstrResources.td8
-rw-r--r--lib/Target/PowerPC/PPC.h8
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp482
-rw-r--r--lib/Target/PowerPC/PPCBranchCoalescing.cpp13
-rw-r--r--lib/Target/PowerPC/PPCBranchSelector.cpp29
-rw-r--r--lib/Target/PowerPC/PPCFastISel.cpp41
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp71
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.h11
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp105
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp546
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h49
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td4
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td12
-rw-r--r--lib/Target/PowerPC/PPCInstrFormats.td9
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp336
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h42
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td206
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td180
-rw-r--r--lib/Target/PowerPC/PPCLoopPreIncPrep.cpp670
-rw-r--r--lib/Target/PowerPC/PPCMCInstLower.cpp23
-rw-r--r--lib/Target/PowerPC/PPCMIPeephole.cpp82
-rw-r--r--lib/Target/PowerPC/PPCPreEmitPeephole.cpp106
-rw-r--r--lib/Target/PowerPC/PPCQPXLoadSplat.cpp6
-rw-r--r--lib/Target/PowerPC/PPCReduceCRLogicals.cpp15
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp39
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.td22
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp18
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h24
-rw-r--r--lib/Target/PowerPC/PPCTLSDynamicCall.cpp4
-rw-r--r--lib/Target/PowerPC/PPCTOCRegDeps.cpp9
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp38
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.cpp68
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.h12
-rw-r--r--lib/Target/PowerPC/PPCVSXCopy.cpp6
-rw-r--r--lib/Target/PowerPC/PPCVSXFMAMutate.cpp18
-rw-r--r--lib/Target/PowerPC/PPCVSXSwapRemoval.cpp32
-rw-r--r--lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp282
-rw-r--r--lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp139
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp17
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp13
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp41
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h8
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp20
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h3
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp4
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h1
-rw-r--r--lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp4
-rw-r--r--lib/Target/RISCV/RISCV.h7
-rw-r--r--lib/Target/RISCV/RISCV.td11
-rw-r--r--lib/Target/RISCV/RISCVCallLowering.cpp50
-rw-r--r--lib/Target/RISCV/RISCVCallLowering.h42
-rw-r--r--lib/Target/RISCV/RISCVCallingConv.td28
-rw-r--r--lib/Target/RISCV/RISCVExpandPseudoInsts.cpp54
-rw-r--r--lib/Target/RISCV/RISCVFrameLowering.cpp164
-rw-r--r--lib/Target/RISCV/RISCVFrameLowering.h9
-rw-r--r--lib/Target/RISCV/RISCVISelDAGToDAG.cpp5
-rw-r--r--lib/Target/RISCV/RISCVISelLowering.cpp323
-rw-r--r--lib/Target/RISCV/RISCVISelLowering.h6
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.cpp118
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.h18
-rw-r--r--lib/Target/RISCV/RISCVInstrInfo.td22
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoA.td34
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoC.td124
-rw-r--r--lib/Target/RISCV/RISCVInstrInfoF.td6
-rw-r--r--lib/Target/RISCV/RISCVInstructionSelector.cpp103
-rw-r--r--lib/Target/RISCV/RISCVLegalizerInfo.cpp23
-rw-r--r--lib/Target/RISCV/RISCVLegalizerInfo.h28
-rw-r--r--lib/Target/RISCV/RISCVMergeBaseOffset.cpp16
-rw-r--r--lib/Target/RISCV/RISCVRegisterBankInfo.cpp26
-rw-r--r--lib/Target/RISCV/RISCVRegisterBankInfo.h37
-rw-r--r--lib/Target/RISCV/RISCVRegisterBanks.td13
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.cpp13
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.h6
-rw-r--r--lib/Target/RISCV/RISCVRegisterInfo.td100
-rw-r--r--lib/Target/RISCV/RISCVSubtarget.cpp30
-rw-r--r--lib/Target/RISCV/RISCVSubtarget.h20
-rw-r--r--lib/Target/RISCV/RISCVTargetMachine.cpp31
-rw-r--r--lib/Target/RISCV/Utils/RISCVBaseInfo.h16
-rw-r--r--lib/Target/Sparc/AsmParser/SparcAsmParser.cpp8
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp10
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp6
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp3
-rw-r--r--lib/Target/Sparc/SparcISelDAGToDAG.cpp4
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp28
-rw-r--r--lib/Target/Sparc/SparcISelLowering.h4
-rw-r--r--lib/Target/Sparc/SparcInstr64Bit.td2
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp4
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td8
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.cpp12
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.cpp4
-rw-r--r--lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp12
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZ.h1
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.cpp20
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.h1
-rw-r--r--lib/Target/SystemZ/SystemZElimCompare.cpp9
-rw-r--r--lib/Target/SystemZ/SystemZExpandPseudo.cpp152
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.cpp6
-rw-r--r--lib/Target/SystemZ/SystemZISelDAGToDAG.cpp11
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp244
-rw-r--r--lib/Target/SystemZ/SystemZInstrFP.td32
-rw-r--r--lib/Target/SystemZ/SystemZInstrFormats.td166
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp168
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.h29
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.td22
-rw-r--r--lib/Target/SystemZ/SystemZInstrVector.td26
-rw-r--r--lib/Target/SystemZ/SystemZLongBranch.cpp26
-rw-r--r--lib/Target/SystemZ/SystemZMachineScheduler.cpp5
-rw-r--r--lib/Target/SystemZ/SystemZOperands.td121
-rw-r--r--lib/Target/SystemZ/SystemZOperators.td6
-rw-r--r--lib/Target/SystemZ/SystemZPatterns.td4
-rw-r--r--lib/Target/SystemZ/SystemZPostRewrite.cpp164
-rw-r--r--lib/Target/SystemZ/SystemZProcessors.td3
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp19
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.h9
-rw-r--r--lib/Target/SystemZ/SystemZSchedule.td2
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZ15.td (renamed from lib/Target/SystemZ/SystemZScheduleArch13.td)64
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp8
-rw-r--r--lib/Target/SystemZ/SystemZShortenInst.cpp4
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.cpp11
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.cpp5
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.h8
-rw-r--r--lib/Target/TargetLoweringObjectFile.cpp1
-rw-r--r--lib/Target/TargetMachine.cpp20
-rw-r--r--lib/Target/TargetMachineC.cpp2
-rw-r--r--lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp100
-rw-r--r--lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp24
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp10
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp57
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h3
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp1
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h74
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp33
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h3
-rw-r--r--lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp11
-rw-r--r--lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp38
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGSort.cpp5
-rw-r--r--lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp127
-rw-r--r--lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp22
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFastISel.cpp36
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp3
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp8
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFrameLowering.h4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISD.def2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp131
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.cpp306
-rw-r--r--lib/Target/WebAssembly/WebAssemblyISelLowering.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrAtomics.td100
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrControl.td48
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrConv.td17
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.td3
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrMemory.td51
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrSIMD.td221
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp12
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp119
-rw-r--r--lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp55
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMCInstLower.h3
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp12
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h13
-rw-r--r--lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPeephole.cpp107
-rw-r--r--lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegColoring.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegStackify.cpp24
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp6
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp5
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h2
-rw-r--r--lib/Target/WebAssembly/WebAssemblyUtilities.cpp21
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp170
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParserCommon.h4
-rw-r--r--lib/Target/X86/AsmParser/X86Operand.h25
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp5
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp6
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp19
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp2
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp3
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp61
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h11
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp7
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp2
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp5
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp2
-rw-r--r--lib/Target/X86/X86.h10
-rw-r--r--lib/Target/X86/X86.td56
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp8
-rw-r--r--lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp3
-rw-r--r--lib/Target/X86/X86AvoidTrailingCall.cpp108
-rw-r--r--lib/Target/X86/X86CallFrameOptimization.cpp26
-rw-r--r--lib/Target/X86/X86CallLowering.cpp49
-rw-r--r--lib/Target/X86/X86CallLowering.h5
-rw-r--r--lib/Target/X86/X86CallingConv.td2
-rw-r--r--lib/Target/X86/X86CmovConversion.cpp18
-rw-r--r--lib/Target/X86/X86CondBrFolding.cpp2
-rw-r--r--lib/Target/X86/X86DomainReassignment.cpp20
-rwxr-xr-xlib/Target/X86/X86EvexToVex.cpp2
-rw-r--r--lib/Target/X86/X86ExpandPseudo.cpp11
-rw-r--r--lib/Target/X86/X86FastISel.cpp15
-rw-r--r--lib/Target/X86/X86FixupBWInsts.cpp68
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp201
-rw-r--r--lib/Target/X86/X86FixupSetCC.cpp4
-rw-r--r--lib/Target/X86/X86FlagsCopyLowering.cpp13
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp6
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp106
-rw-r--r--lib/Target/X86/X86FrameLowering.h4
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp304
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp5926
-rw-r--r--lib/Target/X86/X86ISelLowering.h77
-rw-r--r--lib/Target/X86/X86IndirectBranchTracking.cpp2
-rw-r--r--lib/Target/X86/X86InsertPrefetch.cpp8
-rw-r--r--lib/Target/X86/X86InstrAVX512.td1457
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td10
-rw-r--r--lib/Target/X86/X86InstrBuilder.h6
-rw-r--r--lib/Target/X86/X86InstrCMovSetCC.td33
-rw-r--r--lib/Target/X86/X86InstrCompiler.td139
-rw-r--r--lib/Target/X86/X86InstrControl.td85
-rw-r--r--lib/Target/X86/X86InstrExtension.td11
-rw-r--r--lib/Target/X86/X86InstrFoldTables.cpp287
-rw-r--r--lib/Target/X86/X86InstrFoldTables.h39
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td26
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp582
-rw-r--r--lib/Target/X86/X86InstrInfo.h28
-rw-r--r--lib/Target/X86/X86InstrInfo.td57
-rw-r--r--lib/Target/X86/X86InstrMMX.td33
-rw-r--r--lib/Target/X86/X86InstrMPX.td32
-rw-r--r--lib/Target/X86/X86InstrSSE.td551
-rw-r--r--lib/Target/X86/X86InstrSystem.td2
-rw-r--r--lib/Target/X86/X86InstrTSX.td2
-rw-r--r--lib/Target/X86/X86InstrXOP.td26
-rw-r--r--lib/Target/X86/X86InstructionSelector.cpp135
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h6
-rw-r--r--lib/Target/X86/X86LegalizerInfo.cpp20
-rw-r--r--lib/Target/X86/X86LegalizerInfo.h3
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp313
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.h8
-rw-r--r--lib/Target/X86/X86OptimizeLEAs.cpp60
-rw-r--r--lib/Target/X86/X86RegisterBankInfo.cpp4
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp31
-rw-r--r--lib/Target/X86/X86RetpolineThunks.cpp8
-rwxr-xr-xlib/Target/X86/X86SchedBroadwell.td8
-rw-r--r--lib/Target/X86/X86SchedHaswell.td8
-rw-r--r--lib/Target/X86/X86SchedPredicates.td57
-rw-r--r--lib/Target/X86/X86SchedSandyBridge.td8
-rw-r--r--lib/Target/X86/X86SchedSkylakeClient.td8
-rwxr-xr-xlib/Target/X86/X86SchedSkylakeServer.td8
-rw-r--r--lib/Target/X86/X86Schedule.td24
-rw-r--r--lib/Target/X86/X86ScheduleAtom.td6
-rw-r--r--lib/Target/X86/X86ScheduleBdVer2.td6
-rw-r--r--lib/Target/X86/X86ScheduleBtVer2.td257
-rw-r--r--lib/Target/X86/X86ScheduleSLM.td8
-rw-r--r--lib/Target/X86/X86ScheduleZnver1.td8
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp2
-rw-r--r--lib/Target/X86/X86SpeculativeLoadHardening.cpp59
-rw-r--r--lib/Target/X86/X86Subtarget.cpp18
-rw-r--r--lib/Target/X86/X86Subtarget.h23
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp49
-rw-r--r--lib/Target/X86/X86TargetMachine.h2
-rw-r--r--lib/Target/X86/X86TargetObjectFile.cpp4
-rw-r--r--lib/Target/X86/X86TargetObjectFile.h3
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp225
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.h11
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp6
-rw-r--r--lib/Target/X86/X86WinAllocaExpander.cpp4
-rw-r--r--lib/Target/X86/X86WinEHState.cpp5
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp4
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp6
-rw-r--r--lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp2
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp21
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.cpp2
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp2
-rw-r--r--lib/Target/XCore/XCoreTargetTransformInfo.h3
733 files changed, 38983 insertions, 20224 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 6965403a25ab..ac765ebcddc0 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -55,8 +55,9 @@ FunctionPass *createAArch64CollectLOHPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
AArch64Subtarget &, AArch64RegisterBankInfo &);
-FunctionPass *createAArch64PreLegalizeCombiner();
-FunctionPass *createAArch64StackTaggingPass();
+FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAArch64StackTaggingPass(bool MergeInit);
+FunctionPass *createAArch64StackTaggingPreRAPass();
void initializeAArch64A53Fix835769Pass(PassRegistry&);
void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -80,6 +81,7 @@ void initializeFalkorHWPFFixPass(PassRegistry&);
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
void initializeAArch64StackTaggingPass(PassRegistry&);
+void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
} // end namespace llvm
#endif
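
The hunk above only changes the factory declarations; the matching call sites live in AArch64TargetMachine.cpp, which this diff also touches. As a rough sketch of how such parameterized factories are usually wired into AArch64PassConfig -- the hook choices and the MergeInit/IsOptNone plumbing here are assumptions for illustration, not the code in this commit:

    // Sketch only, under the assumptions stated above.
    void AArch64PassConfig::addIRPasses() {
      TargetPassConfig::addIRPasses();
      // Stack tagging is an IR-level pass; whether it merges neighbouring
      // initializers could plausibly be tied to the optimization level.
      addPass(createAArch64StackTaggingPass(
          /*MergeInit=*/getOptLevel() != CodeGenOpt::None));
    }

    bool AArch64PassConfig::addPreLegalizeMachineIR() {
      // The pre-legalizer combiner now knows whether it runs at -O0.
      addPass(createAArch64PreLegalizeCombiner(
          /*IsOptNone=*/getOptLevel() == CodeGenOpt::None));
      return false;
    }

    void AArch64PassConfig::addPreRegAlloc() {
      // New machine pass; its initializer is declared above so that
      // initializeAArch64StackTaggingPreRAPass() can run at startup.
      addPass(createAArch64StackTaggingPreRAPass());
    }
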
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e39c6995e367..5b4c9e2149da 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -115,11 +115,12 @@ def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
"Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>;
-def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true",
+def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true",
"Enable bit permutation SVE2 instructions", [FeatureSVE2]>;
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
+
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
"Has zero-cycle zeroing instructions for generic registers">;
@@ -284,6 +285,10 @@ def FeatureSEL2 : SubtargetFeature<
"sel2", "HasSEL2", "true",
"Enable v8.4-A Secure Exception Level 2 extension">;
+def FeaturePMU : SubtargetFeature<
+ "pmu", "HasPMU", "true",
+ "Enable v8.4-A PMU extension">;
+
def FeatureTLB_RMI : SubtargetFeature<
"tlb-rmi", "HasTLB_RMI", "true",
"Enable v8.4-A TLB Range and Maintenance Instructions">;
@@ -345,6 +350,21 @@ def FeatureRandGen : SubtargetFeature<"rand", "HasRandGen",
def FeatureMTE : SubtargetFeature<"mte", "HasMTE",
"true", "Enable Memory Tagging Extension" >;
+def FeatureTRBE : SubtargetFeature<"trbe", "HasTRBE",
+ "true", "Enable Trace Buffer Extension">;
+
+def FeatureETE : SubtargetFeature<"ete", "HasETE",
+ "true", "Enable Embedded Trace Extension",
+ [FeatureTRBE]>;
+
+def FeatureTME : SubtargetFeature<"tme", "HasTME",
+ "true", "Enable Transactional Memory Extension" >;
+
+def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
+ "AllowTaggedGlobals",
+ "true", "Use an instruction sequence for taking the address of a global "
+ "that allows a memory tag in the upper address bits">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -354,7 +374,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
FeaturePAN, FeatureLOR, FeatureVH]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
- "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO,
+ "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO,
FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>;
def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
@@ -364,7 +384,7 @@ def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
"Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT,
- FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
+ FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeaturePMU, FeatureTLB_RMI,
FeatureFMI, FeatureRCPC_IMMO]>;
def HasV8_5aOps : SubtargetFeature<
@@ -390,6 +410,7 @@ include "AArch64Schedule.td"
include "AArch64InstrInfo.td"
include "AArch64SchedPredicates.td"
include "AArch64SchedPredExynos.td"
+include "AArch64Combine.td"
def AArch64InstrInfo : InstrInfo;
@@ -484,6 +505,19 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeaturePredictableSelectIsExpensive
]>;
+def ProcA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
+ "Cortex-A65 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureDotProd,
+ FeatureFPARMv8,
+ FeatureFullFP16,
+ FeatureNEON,
+ FeatureRAS,
+ FeatureRCPC,
+ FeatureSSBS,
+ ]>;
+
def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
"Cortex-A72 ARM processors", [
FeatureCRC,
@@ -641,6 +675,33 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeatureSlowSTRQro
]>;
+def ProcNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily",
+ "NeoverseE1",
+ "Neoverse E1 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureDotProd,
+ FeatureFPARMv8,
+ FeatureFullFP16,
+ FeatureNEON,
+ FeatureRCPC,
+ FeatureSSBS,
+ ]>;
+
+def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily",
+ "NeoverseN1",
+ "Neoverse N1 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureDotProd,
+ FeatureFPARMv8,
+ FeatureFullFP16,
+ FeatureNEON,
+ FeatureRCPC,
+ FeatureSPE,
+ FeatureSSBS,
+ ]>;
+
def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
FeatureCrypto,
@@ -732,19 +793,28 @@ def : ProcessorModel<"generic", NoSchedModel, [
FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
- FeaturePostRAScheduler
+ FeaturePostRAScheduler,
+// ETE and TRBE are future architecture extensions. We temporarily enable them
+// by default for users targeting generic AArch64, until it is decided in which
+// armv8.x-a architecture revision they will end up. The extensions do not
+// affect code generated by the compiler and can be used only by explicitly
+// mentioning the new system register names in assembly.
+ FeatureETE
]>;
-// FIXME: Cortex-A35 and Cortex-A55 are currently modeled as a Cortex-A53.
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
+def : ProcessorModel<"cortex-a65", CortexA53Model, [ProcA65]>;
+def : ProcessorModel<"cortex-a65ae", CortexA53Model, [ProcA65]>;
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
+def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>;
+def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
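
Beyond the new processor models, each feature record added above (FeaturePMU, FeatureTRBE/FeatureETE, FeatureTME, FeatureTaggedGlobals, and the renamed "sve2-bitperm") becomes a boolean on AArch64Subtarget that C++ code and .td predicates test. The accessor name below is assumed from the HasSVE2BitPerm field; it is a sketch of the consumer side, not the generated code:

    // Illustration only; the accessor name is assumed from the "Has*" field.
    static bool canUseSVE2BitPerm(const AArch64Subtarget &ST) {
      // FeatureSVE2BitPerm lists FeatureSVE2 as an implied feature, so
      // enabling "sve2-bitperm" (e.g. via -mattr=+sve2-bitperm) also sets
      // the SVE2 bit; a single query is enough.
      return ST.hasSVE2BitPerm();
    }
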
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 92c8c4955d50..13d389cec7a0 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -552,7 +552,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
std::vector<unsigned> ToErase;
for (auto &U : I.operands()) {
if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
- unsigned OrigReg = U.getReg();
+ Register OrigReg = U.getReg();
U.setReg(Substs[OrigReg]);
if (U.isKill())
// Don't erase straight away, because there may be other operands
@@ -611,12 +611,12 @@ void AArch64A57FPLoadBalancing::scanInstruction(
// Create a new chain. Multiplies don't require forwarding so can go on any
// unit.
- unsigned DestReg = MI->getOperand(0).getReg();
+ Register DestReg = MI->getOperand(0).getReg();
LLVM_DEBUG(dbgs() << "New chain started for register "
<< printReg(DestReg, TRI) << " at " << *MI);
- auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+ auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg));
ActiveChains[DestReg] = G.get();
AllChains.push_back(std::move(G));
@@ -624,8 +624,8 @@ void AArch64A57FPLoadBalancing::scanInstruction(
// It is beneficial to keep MLAs on the same functional unit as their
// accumulator operand.
- unsigned DestReg = MI->getOperand(0).getReg();
- unsigned AccumReg = MI->getOperand(3).getReg();
+ Register DestReg = MI->getOperand(0).getReg();
+ Register AccumReg = MI->getOperand(3).getReg();
maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
@@ -661,7 +661,7 @@ void AArch64A57FPLoadBalancing::scanInstruction(
LLVM_DEBUG(dbgs() << "Creating new chain for dest register "
<< printReg(DestReg, TRI) << "\n");
- auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+ auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg));
ActiveChains[DestReg] = G.get();
AllChains.push_back(std::move(G));
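
The unsigned-to-Register and llvm::make_unique-to-std::make_unique substitutions in this hunk repeat across most files in the diff. A minimal standalone illustration of what the Register wrapper changes, and what it deliberately keeps source-compatible -- explanatory only, not code from this commit:

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/Register.h"
    using namespace llvm;

    void example(MachineInstr &MI) {
      // Register converts implicitly to and from unsigned, so existing call
      // sites keep compiling; the gain is that the virtual/physical checks
      // now live on the value type rather than on TargetRegisterInfo.
      Register DestReg = MI.getOperand(0).getReg();
      if (Register::isVirtualRegister(DestReg)) {
        // ... virtual-register handling, e.g. querying the register class ...
      }
      unsigned Raw = DestReg; // implicit conversion still works
      (void)Raw;
    }
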
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 89404463e1f0..981b366c14b1 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -105,14 +105,14 @@ static bool isGPR64(unsigned Reg, unsigned SubReg,
const MachineRegisterInfo *MRI) {
if (SubReg)
return false;
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
return AArch64::GPR64RegClass.contains(Reg);
}
static bool isFPR64(unsigned Reg, unsigned SubReg,
const MachineRegisterInfo *MRI) {
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
SubReg == 0) ||
(MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
@@ -201,8 +201,8 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform(
unsigned NumNewCopies = 3;
unsigned NumRemovableCopies = 0;
- unsigned OrigSrc0 = MI.getOperand(1).getReg();
- unsigned OrigSrc1 = MI.getOperand(2).getReg();
+ Register OrigSrc0 = MI.getOperand(1).getReg();
+ Register OrigSrc1 = MI.getOperand(2).getReg();
unsigned SubReg0;
unsigned SubReg1;
if (!MRI->def_empty(OrigSrc0)) {
@@ -236,7 +236,7 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform(
// any of the uses is a transformable instruction, it's likely the tranforms
// will chain, enabling us to save a copy there, too. This is an aggressive
// heuristic that approximates the graph based cost analysis described above.
- unsigned Dst = MI.getOperand(0).getReg();
+ Register Dst = MI.getOperand(0).getReg();
bool AllUsesAreCopies = true;
for (MachineRegisterInfo::use_instr_nodbg_iterator
Use = MRI->use_instr_nodbg_begin(Dst),
@@ -293,8 +293,8 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
assert(OldOpc != NewOpc && "transform an instruction to itself?!");
// Check if we need a copy for the source registers.
- unsigned OrigSrc0 = MI.getOperand(1).getReg();
- unsigned OrigSrc1 = MI.getOperand(2).getReg();
+ Register OrigSrc0 = MI.getOperand(1).getReg();
+ Register OrigSrc1 = MI.getOperand(2).getReg();
unsigned Src0 = 0, SubReg0;
unsigned Src1 = 0, SubReg1;
bool KillSrc0 = false, KillSrc1 = false;
@@ -354,7 +354,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
// Create a vreg for the destination.
// FIXME: No need to do this if the ultimate user expects an FPR64.
// Check for that and avoid the copy if possible.
- unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+ Register Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
// For now, all of the new instructions have the same simple three-register
// form, so no need to special case based on what instruction we're
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 094fbd999523..7ea7915c2ca6 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -99,7 +99,8 @@ public:
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
- std::map<std::pair<unsigned, uint32_t>, MCSymbol *> HwasanMemaccessSymbols;
+ typedef std::tuple<unsigned, bool, uint32_t> HwasanMemaccessTuple;
+ std::map<HwasanMemaccessTuple, MCSymbol *> HwasanMemaccessSymbols;
void LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI);
void EmitHwasanMemaccessSymbols(Module &M);
@@ -150,7 +151,7 @@ private:
void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
bool printAsmRegInClass(const MachineOperand &MO,
- const TargetRegisterClass *RC, bool isVector,
+ const TargetRegisterClass *RC, unsigned AltName,
raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
@@ -236,9 +237,12 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
}
void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
+ bool IsShort =
+ MI.getOpcode() == AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES;
uint32_t AccessInfo = MI.getOperand(1).getImm();
- MCSymbol *&Sym = HwasanMemaccessSymbols[{Reg, AccessInfo}];
+ MCSymbol *&Sym =
+ HwasanMemaccessSymbols[HwasanMemaccessTuple(Reg, IsShort, AccessInfo)];
if (!Sym) {
// FIXME: Make this work on non-ELF.
if (!TM.getTargetTriple().isOSBinFormatELF())
@@ -246,6 +250,8 @@ void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) + "_" +
utostr(AccessInfo);
+ if (IsShort)
+ SymName += "_short";
Sym = OutContext.getOrCreateSymbol(SymName);
}
@@ -263,15 +269,22 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
std::unique_ptr<MCSubtargetInfo> STI(
TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));
- MCSymbol *HwasanTagMismatchSym =
+ MCSymbol *HwasanTagMismatchV1Sym =
OutContext.getOrCreateSymbol("__hwasan_tag_mismatch");
+ MCSymbol *HwasanTagMismatchV2Sym =
+ OutContext.getOrCreateSymbol("__hwasan_tag_mismatch_v2");
- const MCSymbolRefExpr *HwasanTagMismatchRef =
- MCSymbolRefExpr::create(HwasanTagMismatchSym, OutContext);
+ const MCSymbolRefExpr *HwasanTagMismatchV1Ref =
+ MCSymbolRefExpr::create(HwasanTagMismatchV1Sym, OutContext);
+ const MCSymbolRefExpr *HwasanTagMismatchV2Ref =
+ MCSymbolRefExpr::create(HwasanTagMismatchV2Sym, OutContext);
for (auto &P : HwasanMemaccessSymbols) {
- unsigned Reg = P.first.first;
- uint32_t AccessInfo = P.first.second;
+ unsigned Reg = std::get<0>(P.first);
+ bool IsShort = std::get<1>(P.first);
+ uint32_t AccessInfo = std::get<2>(P.first);
+ const MCSymbolRefExpr *HwasanTagMismatchRef =
+ IsShort ? HwasanTagMismatchV2Ref : HwasanTagMismatchV1Ref;
MCSymbol *Sym = P.second;
OutStreamer->SwitchSection(OutContext.getELFSection(
@@ -304,82 +317,86 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
.addReg(Reg)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
*STI);
- MCSymbol *HandlePartialSym = OutContext.createTempSymbol();
+ MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol();
OutStreamer->EmitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::NE)
- .addExpr(MCSymbolRefExpr::create(HandlePartialSym, OutContext)),
+ .addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym,
+ OutContext)),
*STI);
MCSymbol *ReturnSym = OutContext.createTempSymbol();
OutStreamer->EmitLabel(ReturnSym);
OutStreamer->EmitInstruction(
MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
+ OutStreamer->EmitLabel(HandleMismatchOrPartialSym);
- OutStreamer->EmitLabel(HandlePartialSym);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
- .addReg(AArch64::WZR)
- .addReg(AArch64::W16)
- .addImm(15)
- .addImm(0),
- *STI);
- MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::Bcc)
- .addImm(AArch64CC::HI)
- .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
- *STI);
-
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::ANDXri)
- .addReg(AArch64::X17)
- .addReg(Reg)
- .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
- *STI);
- unsigned Size = 1 << (AccessInfo & 0xf);
- if (Size != 1)
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
- .addReg(AArch64::X17)
- .addReg(AArch64::X17)
- .addImm(Size - 1)
+ if (IsShort) {
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+ .addReg(AArch64::WZR)
+ .addReg(AArch64::W16)
+ .addImm(15)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
- .addReg(AArch64::WZR)
- .addReg(AArch64::W16)
- .addReg(AArch64::W17)
- .addImm(0),
- *STI);
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::Bcc)
- .addImm(AArch64CC::LS)
- .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
- *STI);
-
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::ORRXri)
- .addReg(AArch64::X16)
- .addReg(Reg)
- .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
- *STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
- .addReg(AArch64::W16)
- .addReg(AArch64::X16)
- .addImm(0),
- *STI);
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::SUBSXrs)
- .addReg(AArch64::XZR)
- .addReg(AArch64::X16)
- .addReg(Reg)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
- *STI);
- OutStreamer->EmitInstruction(
- MCInstBuilder(AArch64::Bcc)
- .addImm(AArch64CC::EQ)
- .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
- *STI);
+ MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::HI)
+ .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+ *STI);
+
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::ANDXri)
+ .addReg(AArch64::X17)
+ .addReg(Reg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+ *STI);
+ unsigned Size = 1 << (AccessInfo & 0xf);
+ if (Size != 1)
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+ .addReg(AArch64::X17)
+ .addReg(AArch64::X17)
+ .addImm(Size - 1)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+ .addReg(AArch64::WZR)
+ .addReg(AArch64::W16)
+ .addReg(AArch64::W17)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::LS)
+ .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+ *STI);
+
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::ORRXri)
+ .addReg(AArch64::X16)
+ .addReg(Reg)
+ .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+ *STI);
+ OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+ .addReg(AArch64::W16)
+ .addReg(AArch64::X16)
+ .addImm(0),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::SUBSXrs)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X16)
+ .addReg(Reg)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+ *STI);
+ OutStreamer->EmitInstruction(
+ MCInstBuilder(AArch64::Bcc)
+ .addImm(AArch64CC::EQ)
+ .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
+ *STI);
+
+ OutStreamer->EmitLabel(HandleMismatchSym);
+ }
- OutStreamer->EmitLabel(HandleMismatchSym);
OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
.addReg(AArch64::SP)
.addReg(AArch64::X0)
@@ -414,16 +431,16 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
MCInstBuilder(AArch64::ADRP)
.addReg(AArch64::X16)
.addExpr(AArch64MCExpr::create(
- HwasanTagMismatchRef,
- AArch64MCExpr::VariantKind::VK_GOT_PAGE, OutContext)),
+ HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE,
+ OutContext)),
*STI);
OutStreamer->EmitInstruction(
MCInstBuilder(AArch64::LDRXui)
.addReg(AArch64::X16)
.addReg(AArch64::X16)
.addExpr(AArch64MCExpr::create(
- HwasanTagMismatchRef,
- AArch64MCExpr::VariantKind::VK_GOT_LO12, OutContext)),
+ HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12,
+ OutContext)),
*STI);
OutStreamer->EmitInstruction(
MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
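Taken together, the instructions emitted for each (register, short-granules, access-info) symbol implement the check modelled below in plain C++. This is an illustrative sketch only; ShadowBase, ReportTagMismatch and the shadow-lookup details are assumptions, and the real artefact is the AArch64 assembly built with MCInstBuilder above.

#include <cstdint>

extern uint8_t *ShadowBase;                    // assumed hwasan shadow mapping base
extern void ReportTagMismatch(uintptr_t Ptr);  // stands in for __hwasan_tag_mismatch{,_v2}

void CheckTaggedAccess(uintptr_t Ptr, unsigned AccessSize, bool ShortGranules) {
  uint8_t PtrTag = Ptr >> 56;                                     // tag lives in the top byte
  uint8_t MemTag = ShadowBase[(Ptr & 0xffffffffffffffULL) >> 4];  // one byte per 16-byte granule
  if (PtrTag == MemTag)
    return;                                    // fast path: plain RET
  if (ShortGranules && MemTag < 16) {          // 1..15 encodes a short granule
    unsigned Last = (Ptr & 0xf) + AccessSize - 1;
    if (Last < MemTag && *(const uint8_t *)(Ptr | 0xf) == PtrTag)
      return;                                  // access fits in the granule; real tag is in its last byte
  }
  ReportTagMismatch(Ptr);                      // slow path: spill X0/X1 and branch to the runtime via the GOT
}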
@@ -485,15 +502,14 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
default:
llvm_unreachable("<unknown operand type>");
case MachineOperand::MO_Register: {
- unsigned Reg = MO.getReg();
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ Register Reg = MO.getReg();
+ assert(Register::isPhysicalRegister(Reg));
assert(!MO.getSubReg() && "Subregs should be eliminated!");
O << AArch64InstPrinter::getRegisterName(Reg);
break;
}
case MachineOperand::MO_Immediate: {
- int64_t Imm = MO.getImm();
- O << '#' << Imm;
+ O << MO.getImm();
break;
}
case MachineOperand::MO_GlobalAddress: {
@@ -510,7 +526,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
raw_ostream &O) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
switch (Mode) {
default:
return true; // Unknown mode.
@@ -531,14 +547,13 @@ bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
// printing.
bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
const TargetRegisterClass *RC,
- bool isVector, raw_ostream &O) {
+ unsigned AltName, raw_ostream &O) {
assert(MO.isReg() && "Should only get here with a register!");
const TargetRegisterInfo *RI = STI->getRegisterInfo();
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
assert(RI->regsOverlap(RegToPrint, Reg));
- O << AArch64InstPrinter::getRegisterName(
- RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName);
+ O << AArch64InstPrinter::getRegisterName(RegToPrint, AltName);
return false;
}
@@ -574,6 +589,7 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
case 's': // Print S register.
case 'd': // Print D register.
case 'q': // Print Q register.
+ case 'z': // Print Z register.
if (MO.isReg()) {
const TargetRegisterClass *RC;
switch (ExtraCode[0]) {
@@ -592,10 +608,13 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
case 'q':
RC = &AArch64::FPR128RegClass;
break;
+ case 'z':
+ RC = &AArch64::ZPRRegClass;
+ break;
default:
return true;
}
- return printAsmRegInClass(MO, RC, false /* vector */, O);
+ return printAsmRegInClass(MO, RC, AArch64::NoRegAltName, O);
}
printOperand(MI, OpNum, O);
return false;
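The new 'z' modifier (together with the ZPR/PPR handling in the following hunk) is aimed at SVE inline assembly along these lines. This is a hypothetical user-side example; the "+w" constraint spelling for SVE vector types is an assumption and is not part of this patch.

#include <arm_sve.h>

svint32_t add_one(svint32_t v) {
  // %z0 asks the asm printer to emit the operand with its SVE z-register name.
  asm("add %z0.s, %z0.s, #1" : "+w"(v));
  return v;
}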
@@ -605,16 +624,26 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
// According to ARM, we should emit x and v registers unless we have a
// modifier.
if (MO.isReg()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// If this is a w or x register, print an x register.
if (AArch64::GPR32allRegClass.contains(Reg) ||
AArch64::GPR64allRegClass.contains(Reg))
return printAsmMRegister(MO, 'x', O);
+ unsigned AltName = AArch64::NoRegAltName;
+ const TargetRegisterClass *RegClass;
+ if (AArch64::ZPRRegClass.contains(Reg)) {
+ RegClass = &AArch64::ZPRRegClass;
+ } else if (AArch64::PPRRegClass.contains(Reg)) {
+ RegClass = &AArch64::PPRRegClass;
+ } else {
+ RegClass = &AArch64::FPR128RegClass;
+ AltName = AArch64::vreg;
+ }
+
// If this is a b, h, s, d, or q register, print it as a v register.
- return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
- O);
+ return printAsmRegInClass(MO, RegClass, AltName, O);
}
printOperand(MI, OpNum, O);
@@ -682,7 +711,7 @@ void AArch64AsmPrinter::EmitJumpTableInfo() {
if (JTBBs.empty()) continue;
unsigned Size = AFI->getJumpTableEntrySize(JTI);
- EmitAlignment(Log2_32(Size));
+ EmitAlignment(Align(Size));
OutStreamer->EmitLabel(GetJTISymbol(JTI));
for (auto *JTBB : JTBBs)
@@ -725,12 +754,12 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
/// add xDest, xDest, xScratch, lsl #2
void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
const llvm::MachineInstr &MI) {
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned ScratchReg = MI.getOperand(1).getReg();
- unsigned ScratchRegW =
+ Register DestReg = MI.getOperand(0).getReg();
+ Register ScratchReg = MI.getOperand(1).getReg();
+ Register ScratchRegW =
STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32);
- unsigned TableReg = MI.getOperand(2).getReg();
- unsigned EntryReg = MI.getOperand(3).getReg();
+ Register TableReg = MI.getOperand(2).getReg();
+ Register EntryReg = MI.getOperand(3).getReg();
int JTIdx = MI.getOperand(4).getIndex();
bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8;
@@ -800,7 +829,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
if (CallTarget) {
assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
"High 16 bits of call target should be zero.");
- unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+ Register ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
EncodedBytes = 16;
// Materialize the jump address:
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
@@ -830,7 +859,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
}
void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
// Convert H/S/D register to corresponding Q register
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
@@ -894,32 +923,32 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
default:
break;
case AArch64::MOVMCSym: {
- unsigned DestReg = MI->getOperand(0).getReg();
- const MachineOperand &MO_Sym = MI->getOperand(1);
- MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
- MCOperand Hi_MCSym, Lo_MCSym;
-
- Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
- Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
-
- MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
- MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
-
- MCInst MovZ;
- MovZ.setOpcode(AArch64::MOVZXi);
- MovZ.addOperand(MCOperand::createReg(DestReg));
- MovZ.addOperand(Hi_MCSym);
- MovZ.addOperand(MCOperand::createImm(16));
- EmitToStreamer(*OutStreamer, MovZ);
-
- MCInst MovK;
- MovK.setOpcode(AArch64::MOVKXi);
- MovK.addOperand(MCOperand::createReg(DestReg));
- MovK.addOperand(MCOperand::createReg(DestReg));
- MovK.addOperand(Lo_MCSym);
- MovK.addOperand(MCOperand::createImm(0));
- EmitToStreamer(*OutStreamer, MovK);
- return;
+ Register DestReg = MI->getOperand(0).getReg();
+ const MachineOperand &MO_Sym = MI->getOperand(1);
+ MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
+ MCOperand Hi_MCSym, Lo_MCSym;
+
+ Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
+ Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
+
+ MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
+ MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
+
+ MCInst MovZ;
+ MovZ.setOpcode(AArch64::MOVZXi);
+ MovZ.addOperand(MCOperand::createReg(DestReg));
+ MovZ.addOperand(Hi_MCSym);
+ MovZ.addOperand(MCOperand::createImm(16));
+ EmitToStreamer(*OutStreamer, MovZ);
+
+ MCInst MovK;
+ MovK.setOpcode(AArch64::MOVKXi);
+ MovK.addOperand(MCOperand::createReg(DestReg));
+ MovK.addOperand(MCOperand::createReg(DestReg));
+ MovK.addOperand(Lo_MCSym);
+ MovK.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MovK);
+ return;
}
case AArch64::MOVIv2d_ns:
// If the target has <rdar://problem/16473581>, lower this
@@ -1084,6 +1113,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
case AArch64::HWASAN_CHECK_MEMACCESS:
+ case AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES:
LowerHWASAN_CHECK_MEMACCESS(*MI);
return;
@@ -1193,4 +1223,6 @@ extern "C" void LLVMInitializeAArch64AsmPrinter() {
RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
+ RegisterAsmPrinter<AArch64AsmPrinter> W(getTheARM64_32Target());
+ RegisterAsmPrinter<AArch64AsmPrinter> V(getTheAArch64_32Target());
}
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index 59757769c89a..ed93d02aa615 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -99,7 +99,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
/// (it's an implicit-def of the BL).
virtual void markPhysRegUsed(unsigned PhysReg) = 0;
- bool isArgumentHandler() const override { return true; }
+ bool isIncomingArgumentHandler() const override { return true; }
uint64_t StackUsed;
};
@@ -110,6 +110,7 @@ struct FormalArgHandler : public IncomingArgHandler {
: IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
};
@@ -129,14 +130,29 @@ struct CallReturnHandler : public IncomingArgHandler {
struct OutgoingArgHandler : public CallLowering::ValueHandler {
OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
MachineInstrBuilder MIB, CCAssignFn *AssignFn,
- CCAssignFn *AssignFnVarArg)
+ CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
+ int FPDiff = 0)
: ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
- AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
+ AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
+ StackSize(0) {}
+
+ bool isIncomingArgumentHandler() const override { return false; }
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
+ MachineFunction &MF = MIRBuilder.getMF();
LLT p0 = LLT::pointer(0, 64);
LLT s64 = LLT::scalar(64);
+
+ if (IsTailCall) {
+ Offset += FPDiff;
+ int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+ Register FIReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildFrameIndex(FIReg, FI);
+ MPO = MachinePointerInfo::getFixedStack(MF, FI);
+ return FIReg;
+ }
+
Register SPReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildCopy(SPReg, Register(AArch64::SP));
@@ -146,7 +162,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
Register AddrReg = MRI.createGenericVirtualRegister(p0);
MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
- MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+ MPO = MachinePointerInfo::getStack(MF, Offset);
return AddrReg;
}
@@ -173,12 +189,13 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info,
+ ISD::ArgFlagsTy Flags,
CCState &State) override {
bool Res;
if (Info.IsFixed)
- Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
else
- Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State);
StackSize = State.getNextStackOffset();
return Res;
@@ -186,10 +203,19 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
MachineInstrBuilder MIB;
CCAssignFn *AssignFnVarArg;
+ bool IsTailCall;
+
+ /// For tail calls, the byte offset of the call's argument area from the
+ /// callee's. Unused elsewhere.
+ int FPDiff;
uint64_t StackSize;
};
} // namespace
+static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) {
+ return CallConv == CallingConv::Fast && TailCallOpt;
+}
+
void AArch64CallLowering::splitToValueTypes(
const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const {
@@ -207,7 +233,7 @@ void AArch64CallLowering::splitToValueTypes(
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags, OrigArg.IsFixed);
+ OrigArg.Flags[0], OrigArg.IsFixed);
return;
}
@@ -218,13 +244,13 @@ void AArch64CallLowering::splitToValueTypes(
OrigArg.Ty, CallConv, false);
for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
- SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags,
+ SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
OrigArg.IsFixed);
if (NeedsRegBlock)
- SplitArgs.back().Flags.setInConsecutiveRegs();
+ SplitArgs.back().Flags[0].setInConsecutiveRegs();
}
- SplitArgs.back().Flags.setInConsecutiveRegsLast();
+ SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
}
bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -344,6 +370,49 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
return Success;
}
+/// Helper function to compute forwarded registers for musttail calls. Computes
+/// the forwarded registers, sets MBB liveness, and emits COPY instructions that
+/// can be used to save + restore registers later.
+static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
+ CCAssignFn *AssignFn) {
+ MachineBasicBlock &MBB = MIRBuilder.getMBB();
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (!MFI.hasMustTailInVarArgFunc())
+ return;
+
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ const Function &F = MF.getFunction();
+ assert(F.isVarArg() && "Expected F to be vararg?");
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), /*IsVarArg=*/true, MF, ArgLocs,
+ F.getContext());
+ SmallVector<MVT, 2> RegParmTypes;
+ RegParmTypes.push_back(MVT::i64);
+ RegParmTypes.push_back(MVT::f128);
+
+ // Later on, we can use this vector to restore the registers if necessary.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn);
+
+ // Conservatively forward X8, since it might be used for an aggregate
+ // return.
+ if (!CCInfo.isAllocated(AArch64::X8)) {
+ unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
+ Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
+ }
+
+ // Add the forwards to the MachineBasicBlock and MachineFunction.
+ for (const auto &F : Forwards) {
+ MBB.addLiveIn(F.PReg);
+ MIRBuilder.buildCopy(Register(F.VReg), Register(F.PReg));
+ }
+}
+
bool AArch64CallLowering::lowerFormalArguments(
MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -376,64 +445,530 @@ bool AArch64CallLowering::lowerFormalArguments(
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ uint64_t StackOffset = Handler.StackUsed;
if (F.isVarArg()) {
- if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
- // FIXME: we need to reimplement saveVarArgsRegisters from
+ auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ if (!Subtarget.isTargetDarwin()) {
+ // FIXME: we need to reimplement saveVarArgsRegisters from
// AArch64ISelLowering.
return false;
}
- // We currently pass all varargs at 8-byte alignment.
- uint64_t StackOffset = alignTo(Handler.StackUsed, 8);
+ // We currently pass all varargs at 8-byte alignment, or 4 in ILP32.
+ StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8);
auto &MFI = MIRBuilder.getMF().getFrameInfo();
- AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
}
+ if (doesCalleeRestoreStack(F.getCallingConv(),
+ MF.getTarget().Options.GuaranteedTailCallOpt)) {
+ // We have a non-standard ABI, so why not make full use of the stack that
+ // we're going to pop? It must be aligned to 16 B in any case.
+ StackOffset = alignTo(StackOffset, 16);
+
+ // If we're expected to restore the stack (e.g. fastcc), then we'll be
+ // adding a multiple of 16.
+ FuncInfo->setArgumentStackToRestore(StackOffset);
+
+ // Our own callers will guarantee that the space is free by giving an
+ // aligned value to CALLSEQ_START.
+ }
+
+ // When we tail call, we need to check if the callee's arguments
+ // will fit on the caller's stack. So, whenever we lower formal arguments,
+ // we should keep track of this information, since we might lower a tail call
+ // in this function later.
+ FuncInfo->setBytesInStackArgArea(StackOffset);
+
auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
if (Subtarget.hasCustomCallingConv())
Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
+ handleMustTailForwardedRegisters(MIRBuilder, AssignFn);
+
// Move back to the end of the basic block.
MIRBuilder.setMBB(MBB);
return true;
}
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+ return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::PreserveMost:
+ case CallingConv::Swift:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
+/// CC.
+static std::pair<CCAssignFn *, CCAssignFn *>
+getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI) {
+ return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
+}
+
+bool AArch64CallLowering::doCallerAndCalleePassArgsTheSameWay(
+ CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs) const {
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+
+ // If the calling conventions match, then everything must be the same.
+ if (CalleeCC == CallerCC)
+ return true;
+
+ // Check if the caller and callee will handle arguments in the same way.
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+ CCAssignFn *CalleeAssignFnFixed;
+ CCAssignFn *CalleeAssignFnVarArg;
+ std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
+ getAssignFnsForCC(CalleeCC, TLI);
+
+ CCAssignFn *CallerAssignFnFixed;
+ CCAssignFn *CallerAssignFnVarArg;
+ std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
+ getAssignFnsForCC(CallerCC, TLI);
+
+ if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed,
+ *CalleeAssignFnVarArg, *CallerAssignFnFixed,
+ *CallerAssignFnVarArg))
+ return false;
+
+ // Make sure that the caller and callee preserve all of the same registers.
+ auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+ const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+ if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv()) {
+ TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
+ TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
+ }
+
+ return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved);
+}
+
+bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable(
+ CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+ // If there are no outgoing arguments, then we are done.
+ if (OutArgs.empty())
+ return true;
+
+ const Function &CallerF = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+ // We have outgoing arguments. Make sure that we can tail call with them.
+ SmallVector<CCValAssign, 16> OutLocs;
+ CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
+
+ if (!analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg)) {
+ LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
+ return false;
+ }
+
+ // Make sure that they can fit on the caller's stack.
+ const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
+ LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
+ return false;
+ }
+
+ // Verify that the parameters in callee-saved registers match.
+ // TODO: Port this over to CallLowering as general code once swiftself is
+ // supported.
+ auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (unsigned i = 0; i < OutLocs.size(); ++i) {
+ auto &ArgLoc = OutLocs[i];
+ // If it's not a register, it's fine.
+ if (!ArgLoc.isRegLoc()) {
+ if (Info.IsVarArg) {
+ // Be conservative and disallow variadic memory operands to match SDAG's
+ // behaviour.
+ // FIXME: If the caller's calling convention is C, then we can
+ // potentially use its argument area. However, for cases like fastcc,
+ // we can't do anything.
+ LLVM_DEBUG(
+ dbgs()
+ << "... Cannot tail call vararg function with stack arguments\n");
+ return false;
+ }
+ continue;
+ }
+
+ Register Reg = ArgLoc.getLocReg();
+
+ // Only look at callee-saved registers.
+ if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
+ continue;
+
+ LLVM_DEBUG(
+ dbgs()
+ << "... Call has an argument passed in a callee-saved register.\n");
+
+ // Check if it was copied from.
+ ArgInfo &OutInfo = OutArgs[i];
+
+ if (OutInfo.Regs.size() > 1) {
+ LLVM_DEBUG(
+ dbgs() << "... Cannot handle arguments in multiple registers.\n");
+ return false;
+ }
+
+ // Check if we copy the register, walking through copies from virtual
+ // registers. Note that getDefIgnoringCopies does not ignore copies from
+ // physical registers.
+ MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI);
+ if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) {
+ LLVM_DEBUG(
+ dbgs()
+ << "... Parameter was not copied into a VReg, cannot tail call.\n");
+ return false;
+ }
+
+ // Got a copy. Verify that it's the same as the register we want.
+ Register CopyRHS = RegDef->getOperand(1).getReg();
+ if (CopyRHS != Reg) {
+ LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into "
+ "VReg, cannot tail call.\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool AArch64CallLowering::isEligibleForTailCallOptimization(
+ MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &InArgs,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+
+ // Must pass all target-independent checks in order to tail call optimize.
+ if (!Info.IsTailCall)
+ return false;
+
+ CallingConv::ID CalleeCC = Info.CallConv;
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &CallerF = MF.getFunction();
+
+ LLVM_DEBUG(dbgs() << "Attempting to lower call as tail call\n");
+
+ if (Info.SwiftErrorVReg) {
+ // TODO: We should handle this.
+ // Note that this is also handled by the check for no outgoing arguments.
+ // Proactively disabling this though, because the swifterror handling in
+ // lowerCall inserts a COPY *after* the location of the call.
+ LLVM_DEBUG(dbgs() << "... Cannot handle tail calls with swifterror yet.\n");
+ return false;
+ }
+
+ if (!mayTailCallThisCC(CalleeCC)) {
+ LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
+ return false;
+ }
+
+ // Byval parameters hand the function a pointer directly into the stack area
+ // we want to reuse during a tail call. Working around this *is* possible (see
+ // X86).
+ //
+ // FIXME: In AArch64ISelLowering, this isn't worked around. Can/should we try
+ // it?
+ //
+ // On Windows, "inreg" attributes signify non-aggregate indirect returns.
+ // In this case, it is necessary to save/restore X0 in the callee. Tail
+ // call opt interferes with this. So we disable tail call opt when the
+ // caller has an argument with "inreg" attribute.
+ //
+ // FIXME: Check whether the callee also has an "inreg" argument.
+ //
+ // When the caller has a swifterror argument, we don't want to tail call
+ // because we would have to move into the swifterror register before the
+ // tail call.
+ if (any_of(CallerF.args(), [](const Argument &A) {
+ return A.hasByValAttr() || A.hasInRegAttr() || A.hasSwiftErrorAttr();
+ })) {
+ LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval, "
+ "inreg, or swifterror arguments\n");
+ return false;
+ }
+
+ // Externally-defined functions with weak linkage should not be
+ // tail-called on AArch64 when the OS does not support dynamic
+ // pre-emption of symbols, as the AAELF spec requires normal calls
+ // to undefined weak functions to be replaced with a NOP or jump to the
+ // next instruction. The behaviour of branch instructions in this
+ // situation (as used for tail calls) is implementation-defined, so we
+ // cannot rely on the linker replacing the tail call with a return.
+ if (Info.Callee.isGlobal()) {
+ const GlobalValue *GV = Info.Callee.getGlobal();
+ const Triple &TT = MF.getTarget().getTargetTriple();
+ if (GV->hasExternalWeakLinkage() &&
+ (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
+ TT.isOSBinFormatMachO())) {
+ LLVM_DEBUG(dbgs() << "... Cannot tail call externally-defined function "
+ "with weak linkage for this OS.\n");
+ return false;
+ }
+ }
+
+ // If we have -tailcallopt, then we're done.
+ if (MF.getTarget().Options.GuaranteedTailCallOpt)
+ return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
+
+ // We don't have -tailcallopt, so we're allowed to change the ABI (sibcall).
+ // Try to find cases where we can do that.
+
+ // I want anyone implementing a new calling convention to think long and hard
+ // about this assert.
+ assert((!Info.IsVarArg || CalleeCC == CallingConv::C) &&
+ "Unexpected variadic calling convention");
+
+ // Verify that the incoming and outgoing arguments from the callee are
+ // safe to tail call.
+ if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "... Caller and callee have incompatible calling conventions.\n");
+ return false;
+ }
+
+ if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << "... Call is eligible for tail call optimization.\n");
+ return true;
+}
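For orientation, the common case this predicate is meant to accept is an ordinary sibling call, e.g. (illustrative C; whether the call is actually lowered as a tail call still depends on every check above):

int callee(int x);

int caller(int x) {
  return callee(x + 1);  // same calling convention, no stack arguments:
                         // a candidate for TCRETURNdi instead of BL + RET
}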
+
+static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect,
+ bool IsTailCall) {
+ if (!IsTailCall)
+ return IsIndirect ? AArch64::BLR : AArch64::BL;
+
+ if (!IsIndirect)
+ return AArch64::TCRETURNdi;
+
+ // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use
+ // x16 or x17.
+ if (CallerF.hasFnAttribute("branch-target-enforcement"))
+ return AArch64::TCRETURNriBTI;
+
+ return AArch64::TCRETURNri;
+}
+
+bool AArch64CallLowering::lowerTailCall(
+ MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &OutArgs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+ // True when we're tail calling, but without -tailcallopt.
+ bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
+
+ // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64
+ // register class. Until we can do that, we should fall back here.
+ if (F.hasFnAttribute("branch-target-enforcement")) {
+ LLVM_DEBUG(
+ dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n");
+ return false;
+ }
+
+ // Find out which ABI gets to decide where things go.
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+ MachineInstrBuilder CallSeqStart;
+ if (!IsSibCall)
+ CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
+
+ unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true);
+ auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+ MIB.add(Info.Callee);
+
+ // Byte offset for the tail call. When we are sibcalling, this will always
+ // be 0.
+ MIB.addImm(0);
+
+ // Tell the call which registers are clobbered.
+ auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, F.getCallingConv());
+ if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv())
+ TRI->UpdateCustomCallPreservedMask(MF, &Mask);
+ MIB.addRegMask(Mask);
+
+ if (TRI->isAnyArgRegReserved(MF))
+ TRI->emitReservedArgRegCallError(MF);
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0.
+ int FPDiff = 0;
+
+ // This will be 0 for sibcalls, potentially nonzero for tail calls produced
+ // by -tailcallopt. For sibcalls, the memory operands for the call are
+ // already available in the caller's incoming argument space.
+ unsigned NumBytes = 0;
+ if (!IsSibCall) {
+ // We aren't sibcalling, so we need to compute FPDiff. We need to do this
+ // before handling assignments, because FPDiff must be known for memory
+ // arguments.
+ unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+ SmallVector<CCValAssign, 16> OutLocs;
+ CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
+ analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg);
+
+ // The callee will pop the argument stack as a tail call. Thus, we must
+ // keep it 16-byte aligned.
+ NumBytes = alignTo(OutInfo.getNextStackOffset(), 16);
+
+ // FPDiff will be negative if this tail call requires more space than we
+ // would automatically have in our incoming argument space. Positive if we
+ // actually shrink the stack.
+ FPDiff = NumReusableBytes - NumBytes;
+
+ // The stack pointer must be 16-byte aligned at all times it's used for a
+ // memory operation, which in practice means at *all* times and in
+ // particular across call boundaries. Therefore our own arguments started at
+ // a 16-byte aligned SP and the delta applied for the tail call should
+ // satisfy the same constraint.
+ assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+ }
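As a concrete illustration of the arithmetic above (made-up numbers, not taken from the patch):

//   NumReusableBytes             = 32   // caller's incoming stack-argument area
//   OutInfo.getNextStackOffset() = 41   // callee's outgoing stack arguments
//   NumBytes = alignTo(41, 16)   = 48
//   FPDiff   = 32 - 48           = -16  // the callee needs 16 bytes more than the caller's area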
+
+ const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+
+ // Do the actual argument marshalling.
+ SmallVector<unsigned, 8> PhysRegs;
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
+ AssignFnVarArg, true, FPDiff);
+ if (!handleAssignments(MIRBuilder, OutArgs, Handler))
+ return false;
+
+ if (Info.IsVarArg && Info.IsMustTailCall) {
+ // Now we know what's being passed to the function. Add uses to the call for
+ // the forwarded registers that we *aren't* passing as parameters. This will
+ // preserve the copies we built earlier.
+ for (const auto &F : Forwards) {
+ Register ForwardedReg = F.PReg;
+ // If the register is already passed, or aliases a register which is
+ // already being passed, then skip it.
+ if (any_of(MIB->uses(), [&ForwardedReg, &TRI](const MachineOperand &Use) {
+ if (!Use.isReg())
+ return false;
+ return TRI->regsOverlap(Use.getReg(), ForwardedReg);
+ }))
+ continue;
+
+ // We aren't passing it already, so we should add it to the call.
+ MIRBuilder.buildCopy(ForwardedReg, Register(F.VReg));
+ MIB.addReg(ForwardedReg, RegState::Implicit);
+ }
+ }
+
+ // If we have -tailcallopt, we need to adjust the stack. We'll do the call
+ // sequence start and end here.
+ if (!IsSibCall) {
+ MIB->getOperand(1).setImm(FPDiff);
+ CallSeqStart.addImm(NumBytes).addImm(0);
+ // End the call sequence *before* emitting the call. Normally, we would
+ // tidy the frame up after the call. However, here, we've laid out the
+ // parameters so that when SP is reset, they will be in the correct
+ // location.
+ MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0);
+ }
+
+ // Now we can add the actual call instruction to the correct basic block.
+ MIRBuilder.insertInstr(MIB);
+
+ // If Callee is a reg, since it is used by a target specific instruction,
+ // it must have a register class matching the constraint of that instruction.
+ if (Info.Callee.isReg())
+ MIB->getOperand(0).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+ 0));
+
+ MF.getFrameInfo().setHasTailCall();
+ Info.LoweredTailCall = true;
+ return true;
+}
+
bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
- CallingConv::ID CallConv,
- const MachineOperand &Callee,
- const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs,
- Register SwiftErrorVReg) const {
+ CallLoweringInfo &Info) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
auto &DL = F.getParent()->getDataLayout();
+ const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
- SmallVector<ArgInfo, 8> SplitArgs;
- for (auto &OrigArg : OrigArgs) {
- splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv);
+ SmallVector<ArgInfo, 8> OutArgs;
+ for (auto &OrigArg : Info.OrigArgs) {
+ splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv);
// AAPCS requires that we zero-extend i1 to 8 bits by the caller.
if (OrigArg.Ty->isIntegerTy(1))
- SplitArgs.back().Flags.setZExt();
+ OutArgs.back().Flags[0].setZExt();
+ }
+
+ SmallVector<ArgInfo, 8> InArgs;
+ if (!Info.OrigRet.Ty->isVoidTy())
+ splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv());
+
+ // If we can lower as a tail call, do that instead.
+ bool CanTailCallOpt =
+ isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
+
+ // We must emit a tail call if we have musttail.
+ if (Info.IsMustTailCall && !CanTailCallOpt) {
+ // There are types of incoming/outgoing arguments we can't handle yet, so
+ // it doesn't make sense to actually die here like in ISelLowering. Instead,
+ // fall back to SelectionDAG and let it try to handle this.
+ LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
+ return false;
}
+ if (CanTailCallOpt)
+ return lowerTailCall(MIRBuilder, Info, OutArgs);
+
// Find out which ABI gets to decide where things go.
- const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
- CCAssignFn *AssignFnFixed =
- TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
- CCAssignFn *AssignFnVarArg =
- TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true);
+ CCAssignFn *AssignFnFixed;
+ CCAssignFn *AssignFnVarArg;
+ std::tie(AssignFnFixed, AssignFnVarArg) =
+ getAssignFnsForCC(Info.CallConv, TLI);
- auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
+ MachineInstrBuilder CallSeqStart;
+ CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
- auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
- : AArch64::BL);
- MIB.add(Callee);
+ unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false);
+
+ auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+ MIB.add(Info.Callee);
// Tell the call which registers are clobbered.
auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
@@ -448,8 +983,8 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Do the actual argument marshalling.
SmallVector<unsigned, 8> PhysRegs;
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
- AssignFnVarArg);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ AssignFnVarArg, false);
+ if (!handleAssignments(MIRBuilder, OutArgs, Handler))
return false;
// Now we can add the actual call instruction to the correct basic block.
@@ -458,34 +993,37 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// If Callee is a reg, since it is used by a target specific
// instruction, it must have a register class matching the
// constraint of that instruction.
- if (Callee.isReg())
+ if (Info.Callee.isReg())
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+ 0));
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
- CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
- if (!OrigRet.Ty->isVoidTy()) {
- SplitArgs.clear();
-
- splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv());
-
+ if (!Info.OrigRet.Ty->isVoidTy()) {
+ CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ if (!handleAssignments(MIRBuilder, InArgs, Handler))
return false;
}
- if (SwiftErrorVReg) {
+ if (Info.SwiftErrorVReg) {
MIB.addDef(AArch64::X21, RegState::Implicit);
- MIRBuilder.buildCopy(SwiftErrorVReg, Register(AArch64::X21));
+ MIRBuilder.buildCopy(Info.SwiftErrorVReg, Register(AArch64::X21));
}
+ uint64_t CalleePopBytes =
+ doesCalleeRestoreStack(Info.CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt)
+ ? alignTo(Handler.StackSize, 16)
+ : 0;
+
CallSeqStart.addImm(Handler.StackSize).addImm(0);
MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
.addImm(Handler.StackSize)
- .addImm(0);
+ .addImm(CalleePopBytes);
return true;
}
diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h
index 4f428f254537..b0c601c7062c 100644
--- a/lib/Target/AArch64/AArch64CallLowering.h
+++ b/lib/Target/AArch64/AArch64CallLowering.h
@@ -40,16 +40,15 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
- const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs,
- Register SwiftErrorVReg) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
- const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const override {
- return lowerCall(MIRBuilder, CallConv, Callee, OrigRet, OrigArgs, 0);
- }
+ /// Returns true if the call can be lowered as a tail call.
+ bool
+ isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &InArgs,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
bool supportSwiftError() const override { return true; }
@@ -64,6 +63,18 @@ private:
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI,
CallingConv::ID CallConv) const;
+
+ bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
+
+ bool
+ doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info,
+ MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs) const;
+
+ bool
+ areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &OutArgs) const;
};
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64CallingConvention.cpp b/lib/Target/AArch64/AArch64CallingConvention.cpp
index 02538a187611..a0695cef615f 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -40,12 +40,14 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
CCState &State, unsigned SlotAlign) {
unsigned Size = LocVT.getSizeInBits() / 8;
- unsigned StackAlign =
+ const Align StackAlign =
State.getMachineFunction().getDataLayout().getStackAlignment();
- unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
+ const Align OrigAlign(ArgFlags.getOrigAlign());
+ const Align Align = std::min(OrigAlign, StackAlign);
for (auto &It : PendingMembers) {
- It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign)));
+ It.convertToMem(State.AllocateStack(
+ Size, std::max((unsigned)Align.value(), SlotAlign)));
State.addLoc(It);
SlotAlign = 1;
}
@@ -79,10 +81,14 @@ static bool CC_AArch64_Custom_Stack_Block(
static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+ State.getMachineFunction().getSubtarget());
+ bool IsDarwinILP32 = Subtarget.isTargetILP32() && Subtarget.isTargetMachO();
+
// Try to allocate a contiguous block of registers, each of the correct
// size to hold one member.
ArrayRef<MCPhysReg> RegList;
- if (LocVT.SimpleTy == MVT::i64)
+ if (LocVT.SimpleTy == MVT::i64 || (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32))
RegList = XRegList;
else if (LocVT.SimpleTy == MVT::f16)
RegList = HRegList;
@@ -107,8 +113,12 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
if (!ArgFlags.isInConsecutiveRegsLast())
return true;
- unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
- if (RegResult) {
+ // [N x i32] arguments get packed into x-registers on Darwin's arm64_32
+ // because that's how the armv7k Clang front-end emits small structs.
+ unsigned EltsPerReg = (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32) ? 2 : 1;
+ unsigned RegResult = State.AllocateRegBlock(
+ RegList, alignTo(PendingMembers.size(), EltsPerReg) / EltsPerReg);
+ if (RegResult && EltsPerReg == 1) {
for (auto &It : PendingMembers) {
It.convertToReg(RegResult);
State.addLoc(It);
@@ -116,14 +126,26 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
}
PendingMembers.clear();
return true;
+ } else if (RegResult) {
+ assert(EltsPerReg == 2 && "unexpected ABI");
+ bool UseHigh = false;
+ CCValAssign::LocInfo Info;
+ for (auto &It : PendingMembers) {
+ Info = UseHigh ? CCValAssign::AExtUpper : CCValAssign::ZExt;
+ State.addLoc(CCValAssign::getReg(It.getValNo(), MVT::i32, RegResult,
+ MVT::i64, Info));
+ UseHigh = !UseHigh;
+ if (!UseHigh)
+ ++RegResult;
+ }
+ PendingMembers.clear();
+ return true;
}
// Mark all regs in the class as unavailable
for (auto Reg : RegList)
State.AllocateReg(Reg);
- const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
- State.getMachineFunction().getSubtarget());
unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
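To make the EltsPerReg == 2 path concrete (an illustrative mapping, not code from the patch), a consecutive-register block of three i32 members that is allocated starting at x0 ends up as:

//   member 0 -> x0, bits [31:0]   (CCValAssign::ZExt)
//   member 1 -> x0, bits [63:32]  (CCValAssign::AExtUpper)
//   member 2 -> x1, bits [31:0]   (CCValAssign::ZExt)
// with AllocateRegBlock asked for alignTo(3, 2) / 2 == 2 consecutive X registers.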
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h
index 13cc0c583fd2..5a55d090d7c8 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -25,6 +25,9 @@ bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State);
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index d969a9e1ab3a..bccbbd4591ed 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -17,6 +17,10 @@ class CCIfAlign<string Align, CCAction A> :
class CCIfBigEndian<CCAction A> :
CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>;
+class CCIfILP32<CCAction A> :
+ CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>;
+
+
//===----------------------------------------------------------------------===//
// ARM AAPCS64 Calling Convention
//===----------------------------------------------------------------------===//
@@ -70,6 +74,18 @@ def CC_AArch64_AAPCS : CallingConv<[
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+ CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+ CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
+ CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+ CCPassIndirect<i64>>,
+
+ CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+ CCAssignToReg<[P0, P1, P2, P3]>>,
+ CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+ CCPassIndirect<i64>>,
+
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -111,6 +127,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
// Big endian vectors must be passed as if they were 1-element vectors so that
@@ -135,7 +152,14 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
- CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+ CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+ nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+ CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
+
+ CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+ CCAssignToReg<[P0, P1, P2, P3]>>
]>;
// Vararg functions on windows pass floats in integer registers
@@ -202,6 +226,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[
CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Re-demote pointers to 32-bits so we don't end up storing 64-bit
+ // values and clobbering neighbouring stack locations. Not very pretty.
+ CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
+ CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>,
+
CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
CCAssignToStack<8, 8>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
@@ -229,6 +259,29 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
CCAssignToStack<16, 16>>
]>;
+// In the ILP32 world, the minimum stack slot size is 4 bytes. Otherwise the
+// same as the normal Darwin VarArgs handling.
+let Entry = 1 in
+def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
+ CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+ CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+ // Handle all scalar types as either i32 or f32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[f16], CCPromoteToType<f32>>,
+
+ // Everything is on the stack.
+ // i128 is split into two i64s, and its stack alignment is 16 bytes.
+ CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToStack<16, 16>>
+]>;
+
+
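The pointer re-demotion rule above is what lets a variadic callee on Darwin's arm64_32 walk pointer arguments in 4-byte slots, e.g. (plain C, illustrative only):

#include <stdarg.h>

int count_nonnull(int n, ...) {
  va_list ap;
  va_start(ap, n);
  int found = 0;
  for (int i = 0; i < n; ++i)
    if (va_arg(ap, void *) != 0)  // each pointer occupies a single 4-byte slot
      ++found;
  va_end(ap);
  return found;
}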
// The WebKit_JS calling convention only passes the first argument (the callee)
// in register and the remaining arguments on stack. We allow 32bit stack slots,
// so that WebKit can write partial values in the stack and define the other
@@ -298,6 +351,12 @@ def CC_AArch64_GHC : CallingConv<[
CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
]>;
+// The order of the callee-saves in this file is important, because the
+// FrameLowering code will use this order to determine the layout of the
+// callee-save area in the stack frame. As can be observed below, Darwin
+// requires the frame-record (LR, FP) to be at the top of the callee-save area,
+// whereas for other platforms they are at the bottom.
+
// FIXME: LR is only callee-saved in the sense that *we* preserve it and are
// presumably a callee to someone. External functions may not do so, but this
// is currently safe since BL has LR as an implicit-def and what happens after a
@@ -306,7 +365,13 @@ def CC_AArch64_GHC : CallingConv<[
// It would be better to model its preservation semantics properly (create a
// vreg on entry, use it in RET & tail call generation; make that vreg def if we
// end up saving LR as part of a call frame). Watch this space...
-def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+ X25, X26, X27, X28, LR, FP,
+ D8, D9, D10, D11,
+ D12, D13, D14, D15)>;
+
+// Darwin puts the frame-record at the top of the callee-save area.
+def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
X23, X24, X25, X26, X27, X28,
D8, D9, D10, D11,
D12, D13, D14, D15)>;
@@ -314,17 +379,24 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
// Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x.
// We put FP before LR, so that frame lowering logic generates (FP,LR) pairs,
// and not (LR,FP) pairs.
-def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add FP, LR, X19, X20, X21, X22,
- X23, X24, X25, X26, X27, X28,
+def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+ X25, X26, X27, X28, FP, LR,
D8, D9, D10, D11,
D12, D13, D14, D15)>;
// AArch64 PCS for vector functions (VPCS)
// must (additionally) preserve full Q8-Q23 registers
-def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
- X23, X24, X25, X26, X27, X28,
+def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+ X25, X26, X27, X28, LR, FP,
(sequence "Q%u", 8, 23))>;
+// Functions taking SVE arguments or returning an SVE type
+// must (additionally) preserve full Z8-Z23 and predicate registers P4-P15
+def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
+ X25, X26, X27, X28, LR, FP,
+ (sequence "Z%u", 8, 23),
+ (sequence "P%u", 4, 15))>;
+
// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
// 'this' and the pointer return value are both passed in X0 in these cases,
// this can be partially modelled by treating X0 as a callee-saved register;
@@ -336,7 +408,7 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
def CSR_AArch64_AAPCS_SwiftError
- : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>;
+ : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>;
// The function used by Darwin to obtain the address of a thread-local variable
// guarantees more than a normal AAPCS function. x16 and x17 are used on the
@@ -352,7 +424,7 @@ def CSR_AArch64_TLS_Darwin
// fast path calls a function that follows CSR_AArch64_TLS_Darwin,
// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin.
def CSR_AArch64_CXX_TLS_Darwin
- : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+ : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS,
(sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
(sequence "D%u", 0, 31))>;
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 9f324b433209..35e6fef24363 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -103,6 +103,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -181,6 +182,7 @@ static bool canDefBePartOfLOH(const MachineInstr &MI) {
case AArch64::ADDXri:
return canAddBePartOfLOH(MI);
case AArch64::LDRXui:
+ case AArch64::LDRWui:
// Check immediate to see if the immediate is an address.
switch (MI.getOperand(2).getType()) {
default:
@@ -312,7 +314,8 @@ static void handleUse(const MachineInstr &MI, const MachineOperand &MO,
Info.Type = MCLOH_AdrpAdd;
Info.IsCandidate = true;
Info.MI0 = &MI;
- } else if (MI.getOpcode() == AArch64::LDRXui &&
+ } else if ((MI.getOpcode() == AArch64::LDRXui ||
+ MI.getOpcode() == AArch64::LDRWui) &&
MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) {
Info.Type = MCLOH_AdrpLdrGot;
Info.IsCandidate = true;
@@ -357,7 +360,9 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo,
return true;
}
} else {
- assert(MI.getOpcode() == AArch64::LDRXui && "Expect LDRXui");
+ assert((MI.getOpcode() == AArch64::LDRXui ||
+ MI.getOpcode() == AArch64::LDRWui) &&
+ "Expect LDRXui or LDRWui");
assert((MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) &&
"Expected GOT relocation");
if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) {
@@ -474,13 +479,23 @@ static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) {
handleClobber(LOHInfos[Idx]);
}
// Handle uses.
+
+ SmallSet<int, 4> UsesSeen;
for (const MachineOperand &MO : MI.uses()) {
if (!MO.isReg() || !MO.readsReg())
continue;
int Idx = mapRegToGPRIndex(MO.getReg());
if (Idx < 0)
continue;
- handleUse(MI, MO, LOHInfos[Idx]);
+
+ // Multiple uses of the same register within a single instruction don't
+ // count as MultiUser or block optimization. This is especially important on
+ // arm64_32, where any memory operation is likely to be an explicit use of
+ // xN and an implicit use of wN (the base address register).
+ if (!UsesSeen.count(Idx)) {
+ handleUse(MI, MO, LOHInfos[Idx]);
+ UsesSeen.insert(Idx);
+ }
}
}
@@ -512,6 +527,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
switch (Opcode) {
case AArch64::ADDXri:
case AArch64::LDRXui:
+ case AArch64::LDRWui:
if (canDefBePartOfLOH(MI)) {
const MachineOperand &Def = MI.getOperand(0);
const MachineOperand &Op = MI.getOperand(1);
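
The change to handleNormalInst() above makes a use of the same GPR count only once per instruction, which matters on arm64_32 where a single memory access can name the same register as both xN and wN. Below is a small self-contained sketch of that SmallSet deduplication pattern; the helper name and the plain integer indices are illustrative stand-ins for the pass's mapRegToGPRIndex() values.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"

// Sketch of the "count each register at most once per instruction" pattern
// that the patch adds to handleNormalInst().
static llvm::SmallVector<int, 4>
uniqueUseIndices(llvm::ArrayRef<int> UseIndices) {
  llvm::SmallSet<int, 4> Seen;
  llvm::SmallVector<int, 4> Result;
  for (int Idx : UseIndices) {
    if (Idx < 0)             // not a GPR we track
      continue;
    if (!Seen.count(Idx)) {  // only the first use of this register counts
      Result.push_back(Idx);
      Seen.insert(Idx);
    }
  }
  return Result;
}
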
diff --git a/lib/Target/AArch64/AArch64Combine.td b/lib/Target/AArch64/AArch64Combine.td
new file mode 100644
index 000000000000..bb99f2516ecf
--- /dev/null
+++ b/lib/Target/AArch64/AArch64Combine.td
@@ -0,0 +1,18 @@
+//=- AArch64Combine.td - Define AArch64 Combine Rules --------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/GlobalISel/Combine.td"
+
+def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
+ "AArch64GenPreLegalizerCombinerHelper", [all_combines,
+ elide_br_by_inverting_cond]> {
+ let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule";
+}
diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp
index 453132e09669..25e23e4623de 100644
--- a/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -78,7 +78,7 @@ void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const {
}
MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) {
- if (!TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!Register::isVirtualRegister(MO.getReg()))
return nullptr;
return MRI->getUniqueVRegDef(MO.getReg());
}
@@ -98,7 +98,7 @@ MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI,
}
bool Is64Bit;
unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit);
- unsigned NewDestReg = MI.getOperand(0).getReg();
+ Register NewDestReg = MI.getOperand(0).getReg();
if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg()))
NewDestReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
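
The mechanical `unsigned` -> `Register` replacements in this and the following files lean on the wrapper's implicit conversions. A minimal sketch of why the surrounding call sites keep compiling unchanged (the function name is invented for illustration):

#include "llvm/CodeGen/Register.h"

// Sketch: Register wraps the raw 'unsigned' register id used previously. It
// converts implicitly to/from unsigned, which is why call sites such as
// MRI->getUniqueVRegDef(MO.getReg()) keep working, and it hosts the
// virtual/physical-register predicates used throughout this patch.
void exampleRegisterUse(llvm::Register R) {
  if (llvm::Register::isVirtualRegister(R)) { // same check as before
    unsigned Raw = R; // implicit conversion back to the old representation
    (void)Raw;
  }
}
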
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 2cfbcc592d6a..43ae9f8ec47f 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -220,7 +220,7 @@ bool SSACCmpConv::trivialTailPHIs() {
// PHI operands come in (VReg, MBB) pairs.
for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
- unsigned Reg = I.getOperand(oi).getReg();
+ Register Reg = I.getOperand(oi).getReg();
if (MBB == Head) {
assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
HeadReg = Reg;
@@ -259,7 +259,7 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) {
// Writes to the zero register are dead.
if (DstReg == AArch64::WZR || DstReg == AArch64::XZR)
return true;
- if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ if (!Register::isVirtualRegister(DstReg))
return false;
// A virtual register def without any uses will be marked dead later, and
// eventually replaced by the zero register.
@@ -631,7 +631,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
}
const MCInstrDesc &MCID = TII->get(Opc);
// Create a dummy virtual register for the SUBS def.
- unsigned DestReg =
+ Register DestReg =
MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF));
// Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
BuildMI(*Head, Head->end(), TermDL, MCID)
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index a43077cb88ec..bc3808df1dbc 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -145,8 +145,8 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
continue;
// We should not have any relevant physreg defs that are replaceable by
// zero before register allocation. So we just check for dead vreg defs.
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg) ||
(!MO.isDead() && !MRI->use_nodbg_empty(Reg)))
continue;
assert(!MO.isImplicit() && "Unexpected implicit def!");
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 210c10eb1842..082e17e44d04 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -109,7 +109,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned BitSize) {
MachineInstr &MI = *MBBI;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
uint64_t Imm = MI.getOperand(1).getImm();
if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
@@ -150,7 +150,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
} break;
case AArch64::MOVKWi:
case AArch64::MOVKXi: {
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
.addReg(DstReg,
@@ -174,14 +174,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Dest = MI.getOperand(0);
- unsigned StatusReg = MI.getOperand(1).getReg();
+ Register StatusReg = MI.getOperand(1).getReg();
bool StatusDead = MI.getOperand(1).isDead();
// Duplicating undef operands into 2 instructions does not guarantee the same
// value on both; However undef should be replaced by xzr anyway.
assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
- unsigned AddrReg = MI.getOperand(2).getReg();
- unsigned DesiredReg = MI.getOperand(3).getReg();
- unsigned NewReg = MI.getOperand(4).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register DesiredReg = MI.getOperand(3).getReg();
+ Register NewReg = MI.getOperand(4).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -254,16 +254,16 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
DebugLoc DL = MI.getDebugLoc();
MachineOperand &DestLo = MI.getOperand(0);
MachineOperand &DestHi = MI.getOperand(1);
- unsigned StatusReg = MI.getOperand(2).getReg();
+ Register StatusReg = MI.getOperand(2).getReg();
bool StatusDead = MI.getOperand(2).isDead();
// Duplicating undef operands into 2 instructions does not guarantee the same
// value on both; However undef should be replaced by xzr anyway.
assert(!MI.getOperand(3).isUndef() && "cannot handle undef");
- unsigned AddrReg = MI.getOperand(3).getReg();
- unsigned DesiredLoReg = MI.getOperand(4).getReg();
- unsigned DesiredHiReg = MI.getOperand(5).getReg();
- unsigned NewLoReg = MI.getOperand(6).getReg();
- unsigned NewHiReg = MI.getOperand(7).getReg();
+ Register AddrReg = MI.getOperand(3).getReg();
+ Register DesiredLoReg = MI.getOperand(4).getReg();
+ Register DesiredHiReg = MI.getOperand(5).getReg();
+ Register NewLoReg = MI.getOperand(6).getReg();
+ Register NewHiReg = MI.getOperand(7).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -475,7 +475,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::LOADgot: {
MachineFunction *MF = MBB.getParent();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
const MachineOperand &MO1 = MI.getOperand(1);
unsigned Flags = MO1.getTargetFlags();
@@ -495,12 +495,26 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
}
} else {
// Small codemodel expand into ADRP + LDR.
+ MachineFunction &MF = *MI.getParent()->getParent();
+ DebugLoc DL = MI.getDebugLoc();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
- MachineInstrBuilder MIB2 =
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
- .add(MI.getOperand(0))
- .addReg(DstReg);
+
+ MachineInstrBuilder MIB2;
+ if (MF.getSubtarget<AArch64Subtarget>().isTargetILP32()) {
+ auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
+ unsigned Reg32 = TRI->getSubReg(DstReg, AArch64::sub_32);
+ unsigned DstFlags = MI.getOperand(0).getTargetFlags();
+ MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRWui))
+ .addDef(Reg32)
+ .addReg(DstReg, RegState::Kill)
+ .addReg(DstReg, DstFlags | RegState::Implicit);
+ } else {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ MIB2 = BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui))
+ .add(MI.getOperand(0))
+ .addUse(DstReg, RegState::Kill);
+ }
if (MO1.isGlobal()) {
MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
@@ -534,11 +548,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::MOVaddrTLS:
case AArch64::MOVaddrEXT: {
// Expand into ADRP + ADD.
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
.add(MI.getOperand(1));
+ if (MI.getOperand(1).getTargetFlags() & AArch64II::MO_TAGGED) {
+ // MO_TAGGED on the page indicates a tagged address. Set the tag now.
+ // We do so by creating a MOVK that sets bits 48-63 of the register to
+ // (global address + 0x100000000 - PC) >> 48. This assumes that we're in
+ // the small code model so we can assume a binary size of <= 4GB, which
+ // makes the untagged PC relative offset positive. The binary must also be
+ // loaded into address range [0, 2^48). Both of these properties need to
+ // be ensured at runtime when using tagged addresses.
+ auto Tag = MI.getOperand(1);
+ Tag.setTargetFlags(AArch64II::MO_PREL | AArch64II::MO_G3);
+ Tag.setOffset(0x100000000);
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi), DstReg)
+ .addReg(DstReg)
+ .add(Tag)
+ .addImm(48);
+ }
+
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
.add(MI.getOperand(0))
@@ -561,7 +592,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return true;
case AArch64::MOVbaseTLS: {
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
auto SysReg = AArch64SysReg::TPIDR_EL0;
MachineFunction *MF = MBB.getParent();
if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
@@ -642,11 +673,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
// instruction sequence.
int BaseOffset = -AFI->getTaggedBasePointerOffset();
unsigned FrameReg;
- int FrameRegOffset = TFI->resolveFrameOffsetReference(
- MF, BaseOffset, false /*isFixed*/, FrameReg, /*PreferFP=*/false,
+ StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
+ MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg,
+ /*PreferFP=*/false,
/*ForSimm=*/true);
Register SrcReg = FrameReg;
- if (FrameRegOffset != 0) {
+ if (FrameRegOffset) {
// Use output register as temporary.
SrcReg = MI.getOperand(0).getReg();
emitFrameOffset(MBB, &MI, MI.getDebugLoc(), SrcReg, FrameReg,
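
As a side note on the MOVK-based tagging added to the MOVaddr expansion above: under the comment's stated assumptions (image size at most 4 GiB, load address below 2^48, tag held in bits 48-63 of the symbol value), a short derivation shows why the MO_PREL | MO_G3 relocation deposits exactly the tag.

Write the tagged symbol value as $S = T \cdot 2^{48} + s$ with untagged address
$s < 2^{48}$ and tag $T$, and let $P < 2^{48}$ be the link-time address of the
MOVK. A binary no larger than 4\,GiB gives $|s - P| < 2^{32}$, hence
\[
  0 < s + 2^{32} - P < 2^{33} < 2^{48},
\]
and therefore
\[
  \left\lfloor \frac{S + 2^{32} - P}{2^{48}} \right\rfloor
    = T + \left\lfloor \frac{s + 2^{32} - P}{2^{48}} \right\rfloor
    = T ,
\]
so the 16-bit immediate patched into the MOVK is the tag itself, and
MOVK #imm, lsl #48 overwrites only bits 48-63 of the ADRP+ADD result.
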
diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index 3b3182128c4c..b54fc2e51bac 100644
--- a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -642,7 +642,7 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
}
// Loads from the stack pointer don't get prefetched.
- unsigned BaseReg = MI.getOperand(BaseRegIdx).getReg();
+ Register BaseReg = MI.getOperand(BaseRegIdx).getReg();
if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
return None;
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 8dc2768b9597..277a3052f1e5 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -459,7 +459,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO())
return 0;
- unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+ unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
EVT DestEVT = TLI.getValueType(DL, GV->getType(), true);
if (!DestEVT.isSimple())
@@ -474,12 +474,32 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
ADRPReg)
.addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags);
- ResultReg = createResultReg(&AArch64::GPR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
+ unsigned LdrOpc;
+ if (Subtarget->isTargetILP32()) {
+ ResultReg = createResultReg(&AArch64::GPR32RegClass);
+ LdrOpc = AArch64::LDRWui;
+ } else {
+ ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ LdrOpc = AArch64::LDRXui;
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LdrOpc),
ResultReg)
- .addReg(ADRPReg)
- .addGlobalAddress(GV, 0,
- AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags);
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC | OpFlags);
+ if (!Subtarget->isTargetILP32())
+ return ResultReg;
+
+ // LDRWui produces a 32-bit register, but pointers in-register are 64-bits
+ // so we must extend the result on ILP32.
+ unsigned Result64 = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG))
+ .addDef(Result64)
+ .addImm(0)
+ .addReg(ResultReg, RegState::Kill)
+ .addImm(AArch64::sub_32);
+ return Result64;
} else {
// ADRP + ADDX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
@@ -504,6 +524,15 @@ unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) {
if (!CEVT.isSimple())
return 0;
MVT VT = CEVT.getSimpleVT();
+ // arm64_32 has 32-bit pointers held in 64-bit registers. Because of that,
+ // 'null' pointers need to have a somewhat special treatment.
+ if (const auto *CPN = dyn_cast<ConstantPointerNull>(C)) {
+ (void)CPN;
+ assert(CPN->getType()->getPointerAddressSpace() == 0 &&
+ "Unexpected address space");
+ assert(VT == MVT::i64 && "Expected 64-bit pointers");
+ return materializeInt(ConstantInt::get(Type::getInt64Ty(*Context), 0), VT);
+ }
if (const auto *CI = dyn_cast<ConstantInt>(C))
return materializeInt(CI, VT);
@@ -946,6 +975,9 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) {
bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
EVT evt = TLI.getValueType(DL, Ty, true);
+ if (Subtarget->isTargetILP32() && Ty->isPointerTy())
+ return false;
+
// Only handle simple types.
if (evt == MVT::Other || !evt.isSimple())
return false;
@@ -988,6 +1020,9 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const {
}
bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
+ if (Subtarget->isTargetILP32())
+ return false;
+
unsigned ScaleFactor = getImplicitScaleFactor(VT);
if (!ScaleFactor)
return false;
@@ -3165,6 +3200,11 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (IsTailCall)
return false;
+ // FIXME: we could and should support this, but for now correctness at -O0 is
+ // more important.
+ if (Subtarget->isTargetILP32())
+ return false;
+
CodeModel::Model CM = TM.getCodeModel();
// Only support the small-addressing and large code models.
if (CM != CodeModel::Large && !Subtarget->useSmallAddressing())
@@ -3434,8 +3474,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
MFI.setFrameAddressIsTaken(true);
const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
- unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
- unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ Register SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr);
// Recursively load frame address
@@ -3796,6 +3836,11 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
+ // FIXME: in principle it could. Mostly just a case of zero extending outgoing
+ // pointers.
+ if (Subtarget->isTargetILP32())
+ return false;
+
if (F.isVarArg())
return false;
@@ -3842,7 +3887,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
return false;
unsigned SrcReg = Reg + VA.getValNo();
- unsigned DestReg = VA.getLocReg();
+ Register DestReg = VA.getLocReg();
// Avoid a cross-class copy. This is very unlikely.
if (!MRI.getRegClass(SrcReg)->contains(DestReg))
return false;
@@ -3970,7 +4015,7 @@ unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) {
if (DestVT == MVT::i64) {
// We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the
// upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd.
- unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), Reg64)
.addImm(0)
@@ -4123,7 +4168,7 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
};
unsigned Opc = OpcTable[IsZExt][Is64Bit];
if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
- unsigned TmpReg = MRI.createVirtualRegister(RC);
+ Register TmpReg = MRI.createVirtualRegister(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
@@ -4244,7 +4289,7 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
};
unsigned Opc = OpcTable[IsZExt][Is64Bit];
if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
- unsigned TmpReg = MRI.createVirtualRegister(RC);
+ Register TmpReg = MRI.createVirtualRegister(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
@@ -4353,7 +4398,7 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
};
unsigned Opc = OpcTable[IsZExt][Is64Bit];
if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
- unsigned TmpReg = MRI.createVirtualRegister(RC);
+ Register TmpReg = MRI.createVirtualRegister(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
@@ -4412,7 +4457,7 @@ unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
if (DestVT == MVT::i8 || DestVT == MVT::i16)
DestVT = MVT::i32;
else if (DestVT == MVT::i64) {
- unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), Src64)
.addImm(0)
@@ -4495,7 +4540,7 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
const auto *LoadMI = MI;
if (LoadMI->getOpcode() == TargetOpcode::COPY &&
LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) {
- unsigned LoadReg = MI->getOperand(1).getReg();
+ Register LoadReg = MI->getOperand(1).getReg();
LoadMI = MRI.getUniqueVRegDef(LoadReg);
assert(LoadMI && "Expected valid instruction");
}
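
Several of the FastISel paths above (the ILP32 GOT load, emiti1Ext, the shift helpers) widen a 32-bit result into a 64-bit register with the same SUBREG_TO_REG idiom. Here is that step factored out as a hedged sketch; the helper name and parameter list are invented for illustration, and the AArch64::* symbols come from the backend's generated headers.

// Sketch only. Emits SUBREG_TO_REG so a GPR32 value (e.g. an LDRWui result on
// arm64_32) ends up in a GPR64 whose upper 32 bits are known to be zero.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

static unsigned widenPtrTo64(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator InsertPt,
                             const DebugLoc &DL, const TargetInstrInfo &TII,
                             MachineRegisterInfo &MRI, unsigned Reg32) {
  // AArch64::GPR64RegClass and AArch64::sub_32 are target-internal symbols.
  unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
  BuildMI(MBB, InsertPt, DL, TII.get(TargetOpcode::SUBREG_TO_REG), Reg64)
      .addImm(0)                          // the widened bits are zero
      .addReg(Reg32, RegState::Kill)
      .addImm(AArch64::sub_32);           // place the value in the low half
  return Reg64;
}
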
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 8c6e5cbd5c13..68e1e6a30224 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -44,11 +44,19 @@
// | |
// |-----------------------------------|
// | |
-// | prev_fp, prev_lr |
+// | callee-saved gpr registers | <--.
+// | | | On Darwin platforms these
+// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped,
+// | | | (frame record first)
+// | prev_fp, prev_lr | <--'
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
// | |
-// | other callee-saved registers |
+// | callee-saved fp/simd/SVE regs |
+// | |
+// |-----------------------------------|
+// | |
+// | SVE stack objects |
// | |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
@@ -80,6 +88,20 @@
// * A frame pointer is definitely needed when there are local variables with
// more-than-default alignment requirements.
//
+// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
+// callee-saved area, since the unwind encoding does not allow for encoding
+// this dynamically and existing tools depend on this layout. For other
+// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
+// area to allow SVE stack objects (allocated directly below the callee-saves,
+// if available) to be accessed directly from the framepointer.
+// The SVE spill/fill instructions have VL-scaled addressing modes such
+// as:
+// ldr z8, [fp, #-7 mul vl]
+// For SVE the size of the vector length (VL) is not known at compile-time, so
+// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
+// layout, we don't need to add an unscaled offset to the framepointer before
+// accessing the SVE object in the frame.
+//
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
@@ -94,6 +116,7 @@
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
@@ -173,7 +196,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
if (!MO.isFI())
continue;
- int Offset = 0;
+ StackOffset Offset;
if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
AArch64FrameOffsetCannotUpdate)
return 0;
@@ -183,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
return DefaultSafeSPDisplacement;
}
+/// Returns the size of the entire SVE stackframe (calleesaves + spills).
+static StackOffset getSVEStackSize(const MachineFunction &MF) {
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8};
+}
+
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
if (!EnableRedZone)
return false;
@@ -195,7 +224,8 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned NumBytes = AFI->getLocalStackSize();
- return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128);
+ return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
+ getSVEStackSize(MF));
}
/// hasFP - Return true if the specified function should have a dedicated frame
@@ -273,14 +303,15 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
// Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
- emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8},
+ TII);
}
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
// stack, we want to add it back if we have a reserved call frame.
assert(CalleePopAmount < 0xffffff && "call frame too large");
- emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount,
- TII);
+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
+ {-(int64_t)CalleePopAmount, MVT::i8}, TII);
}
return MBB.erase(I);
}
@@ -416,6 +447,9 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ if (MF.getFunction().hasOptSize())
+ return false;
+
if (AFI->getLocalStackSize() == 0)
return false;
@@ -436,6 +470,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
if (canUseRedZone(MF))
return false;
+ // When there is an SVE area on the stack, always allocate the
+ // callee-saves and spills/locals separately.
+ if (getSVEStackSize(MF))
+ return false;
+
return true;
}
@@ -474,8 +513,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
Imm = -Imm;
LLVM_FALLTHROUGH;
case AArch64::STPXpre: {
- unsigned Reg0 = MBBI->getOperand(1).getReg();
- unsigned Reg1 = MBBI->getOperand(2).getReg();
+ Register Reg0 = MBBI->getOperand(1).getReg();
+ Register Reg1 = MBBI->getOperand(2).getReg();
if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
.addImm(Imm * 8)
@@ -523,8 +562,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
}
case AArch64::STPXi:
case AArch64::LDPXi: {
- unsigned Reg0 = MBBI->getOperand(0).getReg();
- unsigned Reg1 = MBBI->getOperand(1).getReg();
+ Register Reg0 = MBBI->getOperand(0).getReg();
+ Register Reg1 = MBBI->getOperand(1).getReg();
if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
.addImm(Imm * 8)
@@ -791,6 +830,10 @@ static bool needsWinCFI(const MachineFunction &MF) {
F.needsUnwindTableEntry();
}
+static bool isTargetDarwin(const MachineFunction &MF) {
+ return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
+}
+
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -846,6 +889,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Ideally it should match SP value after prologue.
AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+ const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
// getStackSize() includes all the locals in its size calculation. We don't
// include these locals when computing the stack size of a funclet, as they
// are allocated in the parent's stack frame and accessed via the frame
@@ -856,6 +901,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
: (int)MFI.getStackSize();
if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
assert(!HasFP && "unexpected function without stack frame but with FP");
+ assert(!SVEStackSize &&
+ "unexpected function without stack frame but with SVE objects");
// All of the stack allocation is for locals.
AFI->setLocalStackSize(NumBytes);
if (!NumBytes)
@@ -866,8 +913,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
AFI->setHasRedZone(true);
++NumRedZoneFunctions;
} else {
- emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+ {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
+ false, NeedsWinCFI, &HasWinCFI);
if (!NeedsWinCFI) {
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
@@ -901,8 +949,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
if (CombineSPBump) {
- emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ assert(!SVEStackSize && "Cannot combine SP bump with SVE");
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+ {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false,
+ NeedsWinCFI, &HasWinCFI);
NumBytes = 0;
} else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
@@ -948,9 +998,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
}
if (HasFP) {
- // Only set up FP if we actually need to. Frame pointer is fp =
- // sp - fixedobject - 16.
- int FPOffset = AFI->getCalleeSavedStackSize() - 16;
+ // Only set up FP if we actually need to.
+ int FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
+
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
@@ -958,8 +1008,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// mov fp,sp when FPOffset is zero.
// Note: All stores of callee-saved registers are marked as "FrameSetup".
// This code marks the instruction(s) that set the FP also.
- emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
+ {FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false,
+ NeedsWinCFI, &HasWinCFI);
}
if (windowsRequiresStackProbe(MF, NumBytes)) {
@@ -1056,6 +1107,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
NumBytes = 0;
}
+ emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII,
+ MachineInstr::FrameSetup);
+
// Allocate space for the rest of the frame.
if (NumBytes) {
const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
@@ -1071,8 +1125,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
// the correct value here, as NumBytes also includes padding bytes,
// which shouldn't be counted here.
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
- MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+ emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
+ {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
+ false, NeedsWinCFI, &HasWinCFI);
if (NeedsRealignment) {
const unsigned Alignment = MFI.getMaxAlignment();
@@ -1130,8 +1185,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (needsFrameMoves) {
const DataLayout &TD = MF.getDataLayout();
- const int StackGrowth = -TD.getPointerSize(0);
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ const int StackGrowth = isTargetDarwin(MF)
+ ? (2 * -TD.getPointerSize(0))
+ : -AFI->getCalleeSavedStackSize();
+ Register FramePtr = RegInfo->getFrameRegister(MF);
// An example of the prologue:
//
// .globl __foo
@@ -1202,7 +1259,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
- nullptr, Reg, 2 * StackGrowth - FixedObject));
+ nullptr, Reg, StackGrowth - FixedObject));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -1401,11 +1458,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameDestroy);
}
+ const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
+ assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
- NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy,
- false, NeedsWinCFI, &HasWinCFI);
+ {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL,
TII->get(AArch64::SEH_EpilogEnd))
@@ -1416,6 +1476,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
+ // Deallocate the SVE area.
+ if (SVEStackSize)
+ if (!AFI->isStackRealigned())
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize,
+ TII, MachineInstr::FrameDestroy);
+
if (!hasFP(MF)) {
bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
@@ -1437,8 +1503,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
- StackRestoreBytes, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI);
+ {StackRestoreBytes, MVT::i8}, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (Done) {
if (NeedsWinCFI) {
HasWinCFI = true;
@@ -1456,13 +1522,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// FIXME: Rather than doing the math here, we should instead just use
// non-post-indexed loads for the restores if we aren't actually going to
// be able to save any instructions.
- if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned()))
+ if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
+ int64_t OffsetToFrameRecord =
+ isTargetDarwin(MF) ? (-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0;
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- -AFI->getCalleeSavedStackSize() + 16, TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI);
- else if (NumBytes)
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI);
+ {OffsetToFrameRecord, MVT::i8},
+ TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
+ } else if (NumBytes)
+ emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+ {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false,
+ NeedsWinCFI);
// This must be placed after the callee-save restore code because that code
// assumes the SP is at the same location as it was after the callee-save save
@@ -1483,8 +1552,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
- AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false,
- NeedsWinCFI, &HasWinCFI);
+ {(int64_t)AfterCSRPopSize, MVT::i8}, TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
}
if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
@@ -1501,10 +1570,11 @@ int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
unsigned &FrameReg) const {
return resolveFrameIndexReference(
- MF, FI, FrameReg,
- /*PreferFP=*/
- MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
- /*ForSimm=*/false);
+ MF, FI, FrameReg,
+ /*PreferFP=*/
+ MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
+ /*ForSimm=*/false)
+ .getBytes();
}
int AArch64FrameLowering::getNonLocalFrameIndexReference(
@@ -1512,18 +1582,19 @@ int AArch64FrameLowering::getNonLocalFrameIndexReference(
return getSEHFrameIndexOffset(MF, FI);
}
-static int getFPOffset(const MachineFunction &MF, int ObjectOffset) {
+static StackOffset getFPOffset(const MachineFunction &MF, int ObjectOffset) {
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
- return ObjectOffset + FixedObject + 16;
+ unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize();
+ return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
}
-static int getStackOffset(const MachineFunction &MF, int ObjectOffset) {
+static StackOffset getStackOffset(const MachineFunction &MF, int ObjectOffset) {
const auto &MFI = MF.getFrameInfo();
- return ObjectOffset + MFI.getStackSize();
+ return {ObjectOffset + (int)MFI.getStackSize(), MVT::i8};
}
int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
@@ -1532,23 +1603,23 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
MF.getSubtarget().getRegisterInfo());
int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
- ? getFPOffset(MF, ObjectOffset)
- : getStackOffset(MF, ObjectOffset);
+ ? getFPOffset(MF, ObjectOffset).getBytes()
+ : getStackOffset(MF, ObjectOffset).getBytes();
}
-int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
- int FI, unsigned &FrameReg,
- bool PreferFP,
- bool ForSimm) const {
+StackOffset AArch64FrameLowering::resolveFrameIndexReference(
+ const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP,
+ bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
int ObjectOffset = MFI.getObjectOffset(FI);
bool isFixed = MFI.isFixedObjectIndex(FI);
- return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, FrameReg,
+ bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector;
+ return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
PreferFP, ForSimm);
}
-int AArch64FrameLowering::resolveFrameOffsetReference(
- const MachineFunction &MF, int ObjectOffset, bool isFixed,
+StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
+ const MachineFunction &MF, int ObjectOffset, bool isFixed, bool isSVE,
unsigned &FrameReg, bool PreferFP, bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
@@ -1556,17 +1627,23 @@ int AArch64FrameLowering::resolveFrameOffsetReference(
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- int FPOffset = getFPOffset(MF, ObjectOffset);
- int Offset = getStackOffset(MF, ObjectOffset);
+ int FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
+ int Offset = getStackOffset(MF, ObjectOffset).getBytes();
bool isCSR =
!isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize());
+ const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
// Use frame pointer to reference fixed objects. Use it for locals if
// there are VLAs or a dynamically realigned SP (and thus the SP isn't
// reliable as a base). Make sure useFPForScavengingIndex() does the
// right thing for the emergency spill slot.
bool UseFP = false;
- if (AFI->hasStackFrame()) {
+ if (AFI->hasStackFrame() && !isSVE) {
+ // We shouldn't prefer using the FP when there is an SVE area
+ // in between the FP and the non-SVE locals/spills.
+ PreferFP &= !SVEStackSize;
+
// Note: Keeping the following as multiple 'if' statements rather than
// merging to a single expression for readability.
//
@@ -1594,8 +1671,10 @@ int AArch64FrameLowering::resolveFrameOffsetReference(
bool CanUseBP = RegInfo->hasBasePointer(MF);
if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
UseFP = PreferFP;
- else if (!CanUseBP) // Can't use BP. Forced to use FP.
+ else if (!CanUseBP) { // Can't use BP. Forced to use FP.
+ assert(!SVEStackSize && "Expected BP to be available");
UseFP = true;
+ }
// else we can use BP and FP, but the offset from FP won't fit.
// That will make us scavenge registers which we can probably avoid by
// using BP. If it won't fit for BP either, we'll scavenge anyway.
@@ -1625,9 +1704,36 @@ int AArch64FrameLowering::resolveFrameOffsetReference(
"In the presence of dynamic stack pointer realignment, "
"non-argument/CSR objects cannot be accessed through the frame pointer");
+ if (isSVE) {
+ int64_t OffsetToSVEArea =
+ MFI.getStackSize() - AFI->getCalleeSavedStackSize();
+ StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8};
+ StackOffset SPOffset = SVEStackSize +
+ StackOffset(ObjectOffset, MVT::nxv1i8) +
+ StackOffset(OffsetToSVEArea, MVT::i8);
+ // Always use the FP for SVE spills if available and beneficial.
+ if (hasFP(MF) &&
+ (SPOffset.getBytes() ||
+ FPOffset.getScalableBytes() < SPOffset.getScalableBytes() ||
+ RegInfo->needsStackRealignment(MF))) {
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FPOffset;
+ }
+
+ FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
+ : (unsigned)AArch64::SP;
+ return SPOffset;
+ }
+
+ StackOffset ScalableOffset = {};
+ if (UseFP && !(isFixed || isCSR))
+ ScalableOffset = -SVEStackSize;
+ if (!UseFP && (isFixed || isCSR))
+ ScalableOffset = SVEStackSize;
+
if (UseFP) {
FrameReg = RegInfo->getFrameRegister(MF);
- return FPOffset;
+ return StackOffset(FPOffset, MVT::i8) + ScalableOffset;
}
// Use the base pointer if we have one.
@@ -1644,7 +1750,7 @@ int AArch64FrameLowering::resolveFrameOffsetReference(
Offset -= AFI->getLocalStackSize();
}
- return Offset;
+ return StackOffset(Offset, MVT::i8) + ScalableOffset;
}
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
@@ -1682,6 +1788,23 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
return true;
}
+/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
+/// WindowsCFI requires that only consecutive registers can be paired.
+/// LR and FP need to be allocated together when the frame needs to save
+/// the frame-record. This means any other register pairing with LR is invalid.
+static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
+ bool NeedsWinCFI, bool NeedsFrameRecord) {
+ if (NeedsWinCFI)
+ return invalidateWindowsRegisterPairing(Reg1, Reg2, true);
+
+ // If we need to store the frame record, don't pair any register
+ // with LR other than FP.
+ if (NeedsFrameRecord)
+ return Reg2 == AArch64::LR;
+
+ return false;
+}
+
namespace {
struct RegPairInfo {
@@ -1701,7 +1824,7 @@ struct RegPairInfo {
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
- bool &NeedShadowCallStackProlog) {
+ bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
if (CSI.empty())
return;
@@ -1743,7 +1866,8 @@ static void computeCalleeSaveRegisterPairs(
switch (RPI.Type) {
case RegPairInfo::GPR:
if (AArch64::GPR64RegClass.contains(NextReg) &&
- !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
+ !invalidateRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
+ NeedsFrameRecord))
RPI.Reg2 = NextReg;
break;
case RegPairInfo::FPR64:
@@ -1777,6 +1901,10 @@ static void computeCalleeSaveRegisterPairs(
(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
"Out of order callee saved regs!");
+ assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
+ RPI.Reg1 == AArch64::LR) &&
+ "FrameRecord must be allocated together with LR");
+
// MachO's compact unwind format relies on all registers being stored in
// adjacent register pairs.
assert((!produceCompactUnwindFrame(MF) ||
@@ -1825,7 +1953,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
bool NeedShadowCallStackProlog = false;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
- NeedShadowCallStackProlog);
+ NeedShadowCallStackProlog, hasFP(MF));
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (NeedShadowCallStackProlog) {
@@ -1955,7 +2083,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
bool NeedShadowCallStackProlog = false;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
- NeedShadowCallStackProlog);
+ NeedShadowCallStackProlog, hasFP(MF));
auto EmitMI = [&](const RegPairInfo &RPI) {
unsigned Reg1 = RPI.Reg1;
@@ -2113,19 +2241,26 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(AArch64::LR);
}
- LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
+ LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
for (unsigned Reg
: SavedRegs.set_bits()) dbgs()
<< ' ' << printReg(Reg, RegInfo);
dbgs() << "\n";);
// If any callee-saved registers are used, the frame cannot be eliminated.
- bool CanEliminateFrame = SavedRegs.count() == 0;
+ unsigned MaxAlign = getStackAlignment();
+ int64_t SVEStackSize =
+ alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign);
+ assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+ bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
- bool BigStack = (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
+
+ // Conservatively always assume BigStack when there are SVE spills.
+ bool BigStack = SVEStackSize ||
+ (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
AFI->setHasStackFrame(true);
@@ -2145,7 +2280,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// store the pair.
if (produceCompactUnwindFrame(MF))
SavedRegs.set(UnspilledCSGPRPaired);
- ExtraCSSpill = UnspilledCSGPRPaired;
+ ExtraCSSpill = UnspilledCSGPR;
}
// If we didn't find an extra callee-saved register to spill, create
@@ -2181,14 +2316,42 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
return AFI->hasCalleeSaveStackFreeSpace();
}
+int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
+ unsigned &MaxAlign) const {
+ // Process all fixed stack objects.
+ int64_t Offset = 0;
+ for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
+ if (MFI.getStackID(I) == TargetStackID::SVEVector) {
+ int64_t FixedOffset = -MFI.getObjectOffset(I);
+ if (FixedOffset > Offset)
+ Offset = FixedOffset;
+ }
+
+ // Note: We don't take allocatable stack objects into
+ // account yet, because allocation for those is not yet
+ // implemented.
+ return Offset;
+}
+
void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
+ "Upwards growing stack unsupported");
+
+ unsigned MaxAlign = getStackAlignment();
+ int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign);
+
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign));
+ assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
if (!MF.hasEHFunclets())
return;
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- MachineFrameInfo &MFI = MF.getFrameInfo();
WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
MachineBasicBlock &MBB = MF.front();
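
Much of the frame-lowering rewrite above hinges on the StackOffset type from the newly added AArch64StackOffset.h (see the diffstat), which carries a fixed byte component and a scalable, vector-length-multiplied component side by side. Below is a minimal standalone sketch of that idea, using two raw integers where the real header keys the constructor off MVT::i8 versus MVT::nxv1i8; the method names mirror the uses in this patch.

#include <cstdint>

// Minimal sketch of the two-part offset modelled by AArch64StackOffset.h: a
// fixed byte part plus a part measured in multiples of the (compile-time
// unknown) SVE vector length.
class StackOffsetSketch {
  int64_t Bytes = 0;          // ordinary, fixed-size part
  int64_t ScalableBytes = 0;  // scaled by the runtime vector length

public:
  StackOffsetSketch() = default;
  StackOffsetSketch(int64_t Fixed, int64_t Scalable)
      : Bytes(Fixed), ScalableBytes(Scalable) {}

  int64_t getBytes() const { return Bytes; }
  int64_t getScalableBytes() const { return ScalableBytes; }

  StackOffsetSketch operator+(const StackOffsetSketch &O) const {
    return {Bytes + O.Bytes, ScalableBytes + O.ScalableBytes};
  }
  StackOffsetSketch operator-() const { return {-Bytes, -ScalableBytes}; }

  // True when there is anything to add to the base register at all, which is
  // how the prologue/epilogue code above tests "is there an SVE area".
  explicit operator bool() const { return Bytes != 0 || ScalableBytes != 0; }
};
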
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 6dbd34b2189f..ac150e86c9eb 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
+#include "AArch64StackOffset.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
@@ -20,7 +21,7 @@ namespace llvm {
class AArch64FrameLowering : public TargetFrameLowering {
public:
explicit AArch64FrameLowering()
- : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
+ : TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16),
true /*StackRealignable*/) {}
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
@@ -39,12 +40,13 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
- int resolveFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg, bool PreferFP,
- bool ForSimm) const;
- int resolveFrameOffsetReference(const MachineFunction &MF, int ObjectOffset,
- bool isFixed, unsigned &FrameReg,
- bool PreferFP, bool ForSimm) const;
+ StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg, bool PreferFP,
+ bool ForSimm) const;
+ StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
+ int ObjectOffset, bool isFixed,
+ bool isSVE, unsigned &FrameReg,
+ bool PreferFP, bool ForSimm) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
@@ -85,9 +87,21 @@ public:
int FI) const override;
int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const;
+ bool isSupportedStackID(TargetStackID::Value ID) const override {
+ switch (ID) {
+ default:
+ return false;
+ case TargetStackID::Default:
+ case TargetStackID::SVEVector:
+ case TargetStackID::NoAlloc:
+ return true;
+ }
+ }
+
private:
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
unsigned StackBumpBytes) const;
+ int64_t determineSVEStackSize(MachineFrameInfo &MF, unsigned &MaxAlign) const;
};
} // End llvm namespace
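
The new isSupportedStackID() override is what allows frame indexes to be tagged as SVE objects in the first place; resolveFrameIndexReference() then checks the object's stack ID to pick the scalable path. A hedged sketch of tagging such an object follows; this patch itself only consumes fixed objects with pre-assigned offsets, and the size/offset values below are placeholders.

// Sketch only: create a fixed stack object and mark it as an SVE object so
// that the frame-lowering code above resolves it with a scalable offset.
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
using namespace llvm;

static int createTaggedSVEObject(MachineFrameInfo &MFI) {
  // Offset and size are placeholders; determineSVEStackSize() derives the SVE
  // area size from the most negative such offset.
  int FI = MFI.CreateFixedObject(/*Size=*/16, /*SPOffset=*/-16,
                                 /*IsImmutable=*/false);
  MFI.setStackID(FI, TargetStackID::SVEVector);
  return FI;
}
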
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index cd7e927ac80c..1f08505f37e7 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2053,7 +2053,7 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
}
static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
- if (Depth >= 6)
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
return;
// Initialize UsefulBits
if (!Depth) {
@@ -2913,49 +2913,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
break;
- case ISD::EXTRACT_VECTOR_ELT: {
- // Extracting lane zero is a special case where we can just use a plain
- // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
- // the rest of the compiler, especially the register allocator and copyi
- // propagation, to reason about, so is preferred when it's possible to
- // use it.
- ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
- // Bail and use the default Select() for non-zero lanes.
- if (LaneNode->getZExtValue() != 0)
- break;
- // If the element type is not the same as the result type, likewise
- // bail and use the default Select(), as there's more to do than just
- // a cross-class COPY. This catches extracts of i8 and i16 elements
- // since they will need an explicit zext.
- if (VT != Node->getOperand(0).getValueType().getVectorElementType())
- break;
- unsigned SubReg;
- switch (Node->getOperand(0)
- .getValueType()
- .getVectorElementType()
- .getSizeInBits()) {
- default:
- llvm_unreachable("Unexpected vector element type!");
- case 64:
- SubReg = AArch64::dsub;
- break;
- case 32:
- SubReg = AArch64::ssub;
- break;
- case 16:
- SubReg = AArch64::hsub;
- break;
- case 8:
- llvm_unreachable("unexpected zext-requiring extract element!");
- }
- SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
- Node->getOperand(0));
- LLVM_DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
- LLVM_DEBUG(Extract->dumpr(CurDAG));
- LLVM_DEBUG(dbgs() << "\n");
- ReplaceNode(Node, Extract.getNode());
- return;
- }
case ISD::Constant: {
// Materialize zero constants as copies from WZR/XZR. This allows
// the coalescer to propagate these into other instructions.
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7becc99fb5c7..2746117e8ee5 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -161,6 +162,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v8f16);
}
+ if (Subtarget->hasSVE()) {
+ // Add legal sve predicate types
+ addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
+ addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
+ addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
+ addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
+
+ // Add legal sve data types
+ addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
+
+ addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
+ }
+
// Compute derived properties from the register classes
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -283,7 +307,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
@@ -297,7 +321,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
}
@@ -606,6 +630,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
+ MaxLoadsPerMemcmpOptSize = 4;
+ MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
+ ? MaxLoadsPerMemcmpOptSize : 8;
+
setStackPointerRegisterToSaveRestore(AArch64::SP);
setSchedulingPreference(Sched::Hybrid);
@@ -613,10 +641,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
EnableExtLdPromotion = true;
// Set required alignment.
- setMinFunctionAlignment(2);
+ setMinFunctionAlignment(Align(4));
// Set preferred alignments.
- setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
- setPrefLoopAlignment(STI.getPrefLoopAlignment());
+ setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
+ setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
// Only change the limit for entries in a jump table if specified by
// the sub target, but not at the command line.
@@ -725,7 +753,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
@@ -741,7 +769,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
- for (MVT InnerVT : MVT::vector_valuetypes()) {
+ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
@@ -773,6 +801,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
+ if (Subtarget->hasSVE()) {
+ for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
+ if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1)
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ }
+ }
+
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
@@ -1025,6 +1060,14 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
Known.One &= Known2.One;
break;
}
+ case AArch64ISD::LOADgot:
+ case AArch64ISD::ADDlow: {
+ if (!Subtarget->isTargetILP32())
+ break;
+ // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
+ Known.Zero = APInt::getHighBitsSet(64, 32);
+ break;
+ }
case ISD::INTRINSIC_W_CHAIN: {
ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
@@ -1100,6 +1143,32 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
return true;
}
+// Same as above but handling LLTs instead.
+bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
+ LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
+ if (Subtarget->requiresStrictAlign())
+ return false;
+
+ if (Fast) {
+ // Some CPUs are fine with unaligned stores except for 128-bit ones.
+ *Fast = !Subtarget->isMisaligned128StoreSlow() ||
+ Ty.getSizeInBytes() != 16 ||
+ // See comments in performSTORECombine() for more details about
+ // these conditions.
+
+ // Code that uses clang vector extensions can mark that it
+ // wants unaligned accesses to be treated as fast by
+ // underspecifying alignment to be 1 or 2.
+ Align <= 2 ||
+
+ // Disregard v2i64. Memcpy lowering produces those and splitting
+ // them regresses performance on micro-benchmarks and olden/bh.
+ Ty == LLT::vector(2, 64);
+ }
+ return true;
+}
+
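For reference, the *Fast condition above can be restated as a plain predicate. This is an illustrative rewrite with invented parameter names, not LLVM code:

// Illustrative restatement only; parameter names are invented for clarity.
static bool unalignedAccessIsFast(bool Misaligned128StoreSlow,
                                  unsigned SizeInBytes, unsigned Align,
                                  bool IsV2i64) {
  return !Misaligned128StoreSlow || // CPU doesn't penalize unaligned 128-bit
         SizeInBytes != 16 ||       // or the access isn't 128 bits wide
         Align <= 2 ||              // or the source deliberately under-aligned
         IsV2i64;                   // or it's the memcpy-produced v2i64 case
}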
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
@@ -1238,6 +1307,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::STZG: return "AArch64ISD::STZG";
case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
+ case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI";
+ case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO";
+ case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI";
+ case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO";
}
return nullptr;
}
@@ -1263,9 +1336,9 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator It = ++MBB->getIterator();
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned IfTrueReg = MI.getOperand(1).getReg();
- unsigned IfFalseReg = MI.getOperand(2).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register IfTrueReg = MI.getOperand(1).getReg();
+ Register IfFalseReg = MI.getOperand(2).getReg();
unsigned CondCode = MI.getOperand(3).getImm();
bool NZCVKilled = MI.getOperand(4).isKill();
@@ -2140,7 +2213,8 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
}
// Returns true if the given Op is the overflow flag result of an overflow
@@ -2349,7 +2423,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions,
SDLoc(Op)).first;
}
@@ -2419,7 +2494,8 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first;
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -2773,6 +2849,19 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_sunpkhi:
+ return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_sunpklo:
+ return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_uunpkhi:
+ return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_uunpklo:
+ return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
+ Op.getOperand(1));
+
case Intrinsic::localaddress: {
const auto &MF = DAG.getMachineFunction();
const auto *RegInfo = Subtarget->getRegisterInfo();
@@ -2937,6 +3026,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::SPLAT_VECTOR:
+ return LowerSPLAT_VECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::SRA:
@@ -3014,8 +3105,11 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
return CC_AArch64_Win64_VarArg;
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
- return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
- case CallingConv::Win64:
+ if (!IsVarArg)
+ return CC_AArch64_DarwinPCS;
+ return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
+ : CC_AArch64_DarwinPCS_VarArg;
+ case CallingConv::Win64:
return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
case CallingConv::AArch64_VectorCall:
return CC_AArch64_AAPCS;
@@ -3038,6 +3132,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
+ DenseMap<unsigned, SDValue> CopiedRegs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
@@ -3094,11 +3189,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
continue;
}
+ SDValue ArgValue;
if (VA.isRegLoc()) {
// Arguments stored in registers.
EVT RegVT = VA.getLocVT();
-
- SDValue ArgValue;
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
@@ -3113,6 +3207,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
RC = &AArch64::FPR64RegClass;
else if (RegVT == MVT::f128 || RegVT.is128BitVector())
RC = &AArch64::FPR128RegClass;
+ else if (RegVT.isScalableVector() &&
+ RegVT.getVectorElementType() == MVT::i1)
+ RC = &AArch64::PPRRegClass;
+ else if (RegVT.isScalableVector())
+ RC = &AArch64::ZPRRegClass;
else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
@@ -3128,20 +3227,23 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
+ case CCValAssign::Indirect:
+ assert(VA.getValVT().isScalableVector() &&
+ "Only scalable vectors can be passed indirectly");
+ llvm_unreachable("Spilling of SVE vectors not yet implemented");
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
break;
case CCValAssign::AExt:
case CCValAssign::SExt:
case CCValAssign::ZExt:
- // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
- // nodes after our lowering.
- assert(RegVT == Ins[i].VT && "incorrect register location selected");
+ break;
+ case CCValAssign::AExtUpper:
+ ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
+ DAG.getConstant(32, DL, RegVT));
+ ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
break;
}
-
- InVals.push_back(ArgValue);
-
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
@@ -3156,7 +3258,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue ArgValue;
// For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
@@ -3165,9 +3266,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
switch (VA.getLocInfo()) {
default:
break;
+ case CCValAssign::Trunc:
case CCValAssign::BCvt:
MemVT = VA.getLocVT();
break;
+ case CCValAssign::Indirect:
+ assert(VA.getValVT().isScalableVector() &&
+ "Only scalable vectors can be passed indirectly");
+ llvm_unreachable("Spilling of SVE vectors not yet implemented");
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
@@ -3184,8 +3290,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
MemVT);
- InVals.push_back(ArgValue);
}
+ if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
+ ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
+ ArgValue, DAG.getValueType(MVT::i32));
+ InVals.push_back(ArgValue);
}
// varargs
@@ -3202,8 +3311,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
- // We currently pass all varargs at 8-byte alignment.
- StackOffset = ((StackOffset + 7) & ~7);
+ // We currently pass all varargs at 8-byte alignment, or 4-byte alignment for ILP32.
+ StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
if (MFI.hasMustTailInVarArgFunc()) {
@@ -3233,8 +3342,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
assert(!FuncInfo->getSRetReturnReg());
MVT PtrTy = getPointerTy(DAG.getDataLayout());
- unsigned Reg =
- MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ Register Reg =
+ MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
@@ -3366,6 +3475,7 @@ SDValue AArch64TargetLowering::LowerCallResult(
: RetCC_AArch64_AAPCS;
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
+ DenseMap<unsigned, SDValue> CopiedRegs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC);
@@ -3383,10 +3493,16 @@ SDValue AArch64TargetLowering::LowerCallResult(
continue;
}
- SDValue Val =
- DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
- Chain = Val.getValue(1);
- InFlag = Val.getValue(2);
+ // Avoid copying a physreg twice, since RegAllocFast only allows one use of
+ // a physreg per block; reuse the first copy instead.
+ SDValue Val = CopiedRegs.lookup(VA.getLocReg());
+ if (!Val) {
+ Val =
+ DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ CopiedRegs[VA.getLocReg()] = Val;
+ }
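The CopiedRegs map above is a per-call memoization: the first CopyFromReg of a given location register is cached and reused. The same pattern in isolation, as generic C++ with invented names (LLVM itself uses DenseMap rather than std::unordered_map):

#include <unordered_map>

// Generic get-or-create: Make() runs at most once per key and the cached
// value is returned on later lookups, mirroring how CopiedRegs reuses the
// first copy emitted for a given location register.
template <typename MapT, typename KeyT, typename MakeFn>
typename MapT::mapped_type &getOrCreate(MapT &Cache, const KeyT &Key,
                                        MakeFn Make) {
  auto It = Cache.find(Key);
  if (It == Cache.end())
    It = Cache.emplace(Key, Make()).first;
  return It->second;
}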
switch (VA.getLocInfo()) {
default:
@@ -3396,6 +3512,15 @@ SDValue AArch64TargetLowering::LowerCallResult(
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
+ case CCValAssign::AExtUpper:
+ Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
+ DAG.getConstant(32, DL, VA.getLocVT()));
+ LLVM_FALLTHROUGH;
+ case CCValAssign::AExt:
+ LLVM_FALLTHROUGH;
+ case CCValAssign::ZExt:
+ Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
+ break;
}
InVals.push_back(Val);
@@ -3593,6 +3718,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction::CallSiteInfo CSInfo;
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -3709,6 +3835,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
getPointerTy(DAG.getDataLayout()));
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallSet<unsigned, 8> RegsUsed;
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -3716,7 +3843,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
- RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+ RegsToPass.emplace_back(F.PReg, Val);
}
}
@@ -3747,12 +3874,25 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::AExtUpper:
+ assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
+ DAG.getConstant(32, DL, VA.getLocVT()));
+ break;
case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ Arg = DAG.getBitcast(VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::Trunc:
+ Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
break;
case CCValAssign::FPExt:
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::Indirect:
+ assert(VA.getValVT().isScalableVector() &&
+ "Only scalable vectors can be passed indirectly");
+ llvm_unreachable("Spilling of SVE vectors not yet implemented");
}
if (VA.isRegLoc()) {
@@ -3764,7 +3904,33 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
"unexpected use of 'returned'");
IsThisReturn = true;
}
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ if (RegsUsed.count(VA.getLocReg())) {
+ // If this register has already been used then we're trying to pack
+ // parts of an [N x i32] into an X-register. The extension type will
+ // take care of putting the two halves in the right place but we have to
+ // combine them.
+ SDValue &Bits =
+ std::find_if(RegsToPass.begin(), RegsToPass.end(),
+ [=](const std::pair<unsigned, SDValue> &Elt) {
+ return Elt.first == VA.getLocReg();
+ })
+ ->second;
+ Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
+ // Call site info is used to track a function's parameter entry values.
+ // For now we only handle the simple case where the parameter is passed
+ // in a whole register.
+ CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(),
+ [&VA](MachineFunction::ArgRegPair ArgReg) {
+ return ArgReg.Reg == VA.getLocReg();
+ }),
+ CSInfo.end());
+ } else {
+ RegsToPass.emplace_back(VA.getLocReg(), Arg);
+ RegsUsed.insert(VA.getLocReg());
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.EnableDebugEntryValues)
+ CSInfo.emplace_back(VA.getLocReg(), i);
+ }
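The AExtUpper/OR combination above packs two 32-bit halves of an [N x i32] argument into a single 64-bit register. On plain integers the same packing is (illustrative only):

#include <cstdint>

// The low half stays in bits [31:0]; the AExtUpper half was already shifted
// into bits [63:32], so OR-ing the two parts yields the packed X-register value.
static uint64_t packHalves(uint32_t Lo, uint32_t Hi) {
  return static_cast<uint64_t>(Lo) | (static_cast<uint64_t>(Hi) << 32);
}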
} else {
assert(VA.isMemLoc());
@@ -3899,6 +4065,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Ops.push_back(DAG.getRegister(RegToPass.first,
RegToPass.second.getValueType()));
+ // Check callee args/returns for SVE registers and set calling convention
+ // accordingly.
+ if (CallConv == CallingConv::C) {
+ bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
+ return Out.VT.isScalableVector();
+ });
+ bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
+ return In.VT.isScalableVector();
+ });
+
+ if (CalleeInSVE || CalleeOutSVE)
+ CallConv = CallingConv::AArch64_SVE_VectorCall;
+ }
+
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
@@ -3930,12 +4110,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// actual call instruction.
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
- return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+ SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+ return Ret;
}
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
uint64_t CalleePopBytes =
DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
@@ -3983,7 +4166,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Copy the result values into the output registers.
SDValue Flag;
- SmallVector<SDValue, 4> RetOps(1, Chain);
+ SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
+ SmallSet<unsigned, 4> RegsUsed;
for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
@@ -4005,11 +4189,38 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::AExt:
+ case CCValAssign::ZExt:
+ Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
+ break;
+ case CCValAssign::AExtUpper:
+ assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
+ Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
+ Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
+ DAG.getConstant(32, DL, VA.getLocVT()));
+ break;
}
- Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+ if (RegsUsed.count(VA.getLocReg())) {
+ SDValue &Bits =
+ std::find_if(RetVals.begin(), RetVals.end(),
+ [=](const std::pair<unsigned, SDValue> &Elt) {
+ return Elt.first == VA.getLocReg();
+ })
+ ->second;
+ Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
+ } else {
+ RetVals.emplace_back(VA.getLocReg(), Arg);
+ RegsUsed.insert(VA.getLocReg());
+ }
+ }
+
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+ for (auto &RetVal : RetVals) {
+ Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ RetOps.push_back(
+ DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
}
// Windows AArch64 ABIs require that for returning structs by value we copy
@@ -4139,8 +4350,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
- unsigned char OpFlags =
- Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+ unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
if (OpFlags != AArch64II::MO_NO_FLAG)
assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
@@ -4204,6 +4414,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SDLoc DL(Op);
MVT PtrVT = getPointerTy(DAG.getDataLayout());
+ MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
SDValue TLVPAddr =
@@ -4214,13 +4425,15 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
// to obtain the address of the variable.
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet = DAG.getLoad(
- MVT::i64, DL, Chain, DescAddr,
+ PtrMemVT, DL, Chain, DescAddr,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
- /* Alignment = */ 8,
- MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
- MachineMemOperand::MODereferenceable);
+ /* Alignment = */ PtrMemVT.getSizeInBits() / 8,
+ MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
Chain = FuncTLVGet.getValue(1);
+ // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
+ FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
+
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
@@ -4470,7 +4683,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// value of a libcall against zero, which is just what the rest of LowerBR_CC
// is expecting to deal with.
if (LHS.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
@@ -4736,7 +4949,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, use it.
if (!RHS.getNode()) {
@@ -4798,7 +5011,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
if (LHS.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
@@ -5096,6 +5309,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
SDLoc DL(Op);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
getPointerTy(DAG.getDataLayout()));
+ FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
@@ -5202,15 +5416,15 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
// AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
// pointer.
SDLoc DL(Op);
- unsigned VaListSize =
- Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32;
+ unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
+ unsigned VaListSize = (Subtarget->isTargetDarwin() ||
+ Subtarget->isTargetWindows()) ? PtrSize : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
- Op.getOperand(2),
- DAG.getConstant(VaListSize, DL, MVT::i32),
- 8, false, false, false, MachinePointerInfo(DestSV),
+ return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize,
+ false, false, false, MachinePointerInfo(DestSV),
MachinePointerInfo(SrcSV));
}
@@ -5224,12 +5438,15 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
unsigned Align = Op.getConstantOperandVal(3);
+ unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
auto PtrVT = getPointerTy(DAG.getDataLayout());
-
- SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
+ auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
+ SDValue VAList =
+ DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
+ VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
- if (Align > 8) {
+ if (Align > MinSlotSize) {
assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(Align - 1, DL, PtrVT));
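The adjustment applied to VAList here is the start of the usual power-of-two align-up; the masking half of the expression lies outside this hunk. The complete computation is equivalent to the following sketch (illustrative only):

#include <cassert>
#include <cstdint>

static uint64_t alignUp(uint64_t Value, uint64_t Align) {
  assert((Align & (Align - 1)) == 0 && "Align must be a power of 2");
  return (Value + Align - 1) & ~(Align - 1);
}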
@@ -5238,14 +5455,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
- uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
+ unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// Scalar integer and FP values smaller than 64 bits are implicitly extended
// up to 64 bits. At the very least, we have to increase the striding of the
// vaargs list to match this, and for FP values we need to introduce
// FP_ROUND nodes as well.
if (VT.isInteger() && !VT.isVector())
- ArgSize = 8;
+ ArgSize = std::max(ArgSize, MinSlotSize);
bool NeedFPTrunc = false;
if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
ArgSize = 8;
@@ -5255,6 +5472,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// Increment the pointer, VAList, to the next vaarg
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
+ VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
+
// Store the incremented VAList to the legalized pointer
SDValue APStore =
DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
@@ -5284,10 +5503,15 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDValue FrameAddr =
- DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
+ DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
while (Depth--)
FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
+
+ if (Subtarget->isTargetILP32())
+ FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
+ DAG.getValueType(VT));
+
return FrameAddr;
}
@@ -5306,9 +5530,9 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
- unsigned Reg = MatchRegisterName(RegName);
+Register AArch64TargetLowering::
+getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const {
+ Register Reg = MatchRegisterName(RegName);
if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
@@ -5653,6 +5877,21 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
return "r";
}
+enum PredicateConstraint {
+ Upl,
+ Upa,
+ Invalid
+};
+
+static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
+ PredicateConstraint P = PredicateConstraint::Invalid;
+ if (Constraint == "Upa")
+ P = PredicateConstraint::Upa;
+ if (Constraint == "Upl")
+ P = PredicateConstraint::Upl;
+ return P;
+}
+
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
@@ -5661,19 +5900,30 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
switch (Constraint[0]) {
default:
break;
- case 'z':
- return C_Other;
case 'x':
case 'w':
+ case 'y':
return C_RegisterClass;
// An address with a single base register. Due to the way we
// currently handle addresses it is the same as 'r'.
case 'Q':
return C_Memory;
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'Y':
+ case 'Z':
+ return C_Immediate;
+ case 'z':
case 'S': // A symbolic address
return C_Other;
}
- }
+ } else if (parsePredicateConstraint(Constraint) !=
+ PredicateConstraint::Invalid)
+ return C_RegisterClass;
return TargetLowering::getConstraintType(Constraint);
}
@@ -5697,12 +5947,17 @@ AArch64TargetLowering::getSingleConstraintMatchWeight(
break;
case 'x':
case 'w':
+ case 'y':
if (type->isFloatingPointTy() || type->isVectorTy())
weight = CW_Register;
break;
case 'z':
weight = CW_Constant;
break;
+ case 'U':
+ if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
+ weight = CW_Register;
+ break;
}
return weight;
}
@@ -5719,6 +5974,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
case 'w':
if (!Subtarget->hasFPARMv8())
break;
+ if (VT.isScalableVector())
+ return std::make_pair(0U, &AArch64::ZPRRegClass);
if (VT.getSizeInBits() == 16)
return std::make_pair(0U, &AArch64::FPR16RegClass);
if (VT.getSizeInBits() == 32)
@@ -5733,9 +5990,25 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
case 'x':
if (!Subtarget->hasFPARMv8())
break;
+ if (VT.isScalableVector())
+ return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128_loRegClass);
break;
+ case 'y':
+ if (!Subtarget->hasFPARMv8())
+ break;
+ if (VT.isScalableVector())
+ return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
+ break;
+ }
+ } else {
+ PredicateConstraint PC = parsePredicateConstraint(Constraint);
+ if (PC != PredicateConstraint::Invalid) {
+ assert(VT.isScalableVector());
+ bool restricted = (PC == PredicateConstraint::Upl);
+ return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
+ : std::make_pair(0U, &AArch64::PPRRegClass);
}
}
if (StringRef("{cc}").equals_lower(Constraint))
@@ -6279,6 +6552,8 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts % 2 != 0)
+ return false;
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
@@ -6446,8 +6721,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
if (!isConcatMask(Mask, VT, SplitV0))
return SDValue();
- EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
+ EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
if (SplitV0) {
V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
DAG.getConstant(0, DL, MVT::i64));
@@ -6790,6 +7064,41 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return GenerateTBL(Op, ShuffleMask, DAG);
}
+SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT ElemVT = VT.getScalarType();
+
+ SDValue SplatVal = Op.getOperand(0);
+
+ // Extend the input splat value where needed to fit into a GPR (32-bit or
+ // 64-bit only); FPRs don't have this restriction.
+ switch (ElemVT.getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ case MVT::i16:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
+ break;
+ case MVT::i64:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
+ break;
+ case MVT::i32:
+ // Fine as is
+ break;
+ // TODO: we can support splats of i1s and float types, but haven't added
+ // patterns yet.
+ case MVT::i1:
+ case MVT::f16:
+ case MVT::f32:
+ case MVT::f64:
+ default:
+ llvm_unreachable("Unsupported SPLAT_VECTOR input operand type");
+ break;
+ }
+
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
+}
+
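A splat replicates one scalar into every lane of a vector, and AArch64ISD::DUP is the hardware form of that replication. A scalar-code sketch of the concept (illustrative only):

#include <array>
#include <cstddef>

// Every lane of the result receives the same scalar value.
template <typename T, std::size_t N>
std::array<T, N> splat(T Value) {
  std::array<T, N> Result;
  Result.fill(Value);
  return Result;
}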
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
@@ -8063,7 +8372,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
- Info.align = 0;
+ Info.align.reset();
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
@@ -8089,7 +8398,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
- Info.align = 0;
+ Info.align.reset();
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
@@ -8101,7 +8410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -8112,7 +8421,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -8122,7 +8431,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 16;
+ Info.align = Align(16);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_stlxp:
@@ -8131,7 +8440,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
- Info.align = 16;
+ Info.align = Align(16);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
default:
@@ -8278,7 +8587,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
// Get the shift amount based on the scaling factor:
// log2(sizeof(IdxTy)) - log2(8).
uint64_t ShiftAmt =
- countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
+ countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
// Is the constant foldable in the shift of the addressing mode?
// I.e., shift amount is between 1 and 4 inclusive.
if (ShiftAmt == 0 || ShiftAmt > 4)
@@ -8739,6 +9048,39 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
return MVT::Other;
}
+LLT AArch64TargetLowering::getOptimalMemOpLLT(
+ uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const {
+ bool CanImplicitFloat =
+ !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
+ bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
+ bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
+ // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
+ // it would take one instruction to materialize the v2i64 zero plus one store
+ // (with a restrictive addressing mode), so plain i64 stores are cheaper.
+ bool IsSmallMemset = IsMemset && Size < 32;
+ auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
+ if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+ return true;
+ bool Fast;
+ return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
+ &Fast) &&
+ Fast;
+ };
+
+ if (CanUseNEON && IsMemset && !IsSmallMemset &&
+ AlignmentIsAcceptable(MVT::v2i64, 16))
+ return LLT::vector(2, 64);
+ if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+ return LLT::scalar(128);
+ if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
+ return LLT::scalar(64);
+ if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
+ return LLT::scalar(32);
+ return LLT();
+}
+
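getOptimalMemOpLLT mirrors the existing getOptimalMemOpType hook: choose the widest chunk that the size and (possibly relaxed) alignment allow, stepping down from 16 bytes to 8 and then 4. A simplified standalone restatement of that preference order (illustrative only, with invented names):

#include <cstdint>

static unsigned memOpChunkBytes(uint64_t Size, unsigned Align,
                                bool UnalignedIsFast) {
  auto Acceptable = [&](unsigned Need) {
    return UnalignedIsFast || Align >= Need;
  };
  if (Size >= 16 && Acceptable(16))
    return 16; // v2i64 / f128 chunk
  if (Size >= 8 && Acceptable(8))
    return 8;  // i64 chunk
  if (Size >= 4 && Acceptable(4))
    return 4;  // i32 chunk
  return 1;    // fall back to byte stores
}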
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
if (Immed == std::numeric_limits<int64_t>::min()) {
@@ -10065,6 +10407,14 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
Opcode = AArch64ISD::SQSHLU_I;
IsRightShift = false;
break;
+ case Intrinsic::aarch64_neon_sshl:
+ case Intrinsic::aarch64_neon_ushl:
+ // For positive shift amounts we can use SHL, since ushl/sshl perform a
+ // regular left shift in that case. Below, we only replace the current node
+ // with VSHL if this condition is met.
+ Opcode = AArch64ISD::VSHL;
+ IsRightShift = false;
+ break;
}
if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
@@ -10151,6 +10501,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_sqshlu:
case Intrinsic::aarch64_neon_srshl:
case Intrinsic::aarch64_neon_urshl:
+ case Intrinsic::aarch64_neon_sshl:
+ case Intrinsic::aarch64_neon_ushl:
return tryCombineShiftImm(IID, N, DAG);
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
@@ -10482,10 +10834,10 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return ReplacedSplat;
SDLoc DL(S);
- unsigned NumElts = VT.getVectorNumElements() / 2;
+
// Split VT into two.
- EVT HalfVT =
- EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
+ EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+ unsigned NumElts = HalfVT.getVectorNumElements();
SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
DAG.getConstant(0, DL, MVT::i64));
SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
@@ -10567,7 +10919,7 @@ static SDValue performPostLD1Combine(SDNode *N,
// are predecessors to each other or the Vector.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
- Visited.insert(N);
+ Visited.insert(Addr.getNode());
Worklist.push_back(User);
Worklist.push_back(LD);
Worklist.push_back(Vector.getNode());
@@ -11983,6 +12335,27 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
return Mask->getValue().isPowerOf2();
}
+bool AArch64TargetLowering::
+ shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const {
+ // Does baseline recommend not to perform the fold by default?
+ if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
+ return false;
+ // Else, if this is a vector shift, prefer 'shl'.
+ return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
+}
+
+bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
+ SDNode *N) const {
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ !Subtarget->isTargetWindows())
+ return false;
+ return true;
+}
+
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
// Update IsSplitCSR in AArch64FunctionInfo.
AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
@@ -12009,7 +12382,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 4421c31f65c9..00fa96bc4e6d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -191,6 +191,11 @@ enum NodeType : unsigned {
FRECPE, FRECPS,
FRSQRTE, FRSQRTS,
+ SUNPKHI,
+ SUNPKLO,
+ UUNPKHI,
+ UUNPKLO,
+
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
@@ -261,6 +266,14 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override {
+ // Returning i64 unconditionally here (i.e. even for ILP32) means that the
+ // *DAG* representation of pointers will always be 64-bits. They will be
+ // truncated and extended when transferred to memory, but the 64-bit DAG
+ // allows us to use AArch64's addressing modes much more easily.
+ return MVT::getIntegerVT(64);
+ }
+
bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
TargetLoweringOpt &TLO) const override;
@@ -272,6 +285,10 @@ public:
EVT VT, unsigned AddrSpace = 0, unsigned Align = 1,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const override;
+ /// LLT variant.
+ bool allowsMisalignedMemoryAccesses(
+ LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast = nullptr) const override;
/// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -358,6 +375,10 @@ public:
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const override;
+ LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ const AttributeList &FuncAttributes) const override;
+
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
@@ -480,11 +501,12 @@ public:
return VT.getSizeInBits() >= 64; // vector 'bic'
}
- bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
- if (DAG.getMachineFunction().getFunction().hasMinSize())
- return false;
- return true;
- }
+ bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const override;
+
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
bool shouldTransformSignedTruncationCheck(EVT XVT,
unsigned KeptBits) const override {
@@ -655,6 +677,7 @@ private:
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
@@ -690,8 +713,8 @@ private:
unsigned combineRepeatedFPDivisors() const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index e22cb44d81ae..459b53923625 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -204,19 +204,27 @@ def : Pat<(relaxed_store<atomic_store_64>
def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
def : Pat<(ldxr_1 GPR64sp:$addr),
(SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
@@ -237,19 +245,27 @@ def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff),
def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
def : Pat<(ldaxr_1 GPR64sp:$addr),
(SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
@@ -271,22 +287,30 @@ def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff),
def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr),
@@ -317,22 +341,30 @@ def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
def stlxr_1 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stlxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }];
+}
def stlxr_2 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stlxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }];
+}
def stlxr_4 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stlxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }];
+}
def stlxr_8 : PatFrag<(ops node:$val, node:$ptr),
(int_aarch64_stlxr node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
+}]> {
+ let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }];
+}
def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr),
@@ -422,4 +454,3 @@ let Predicates = [HasLSE] in {
defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">;
defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">;
}
-
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index d619137b55c5..f555e4123307 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -480,76 +480,40 @@ def BranchTarget14Operand : BranchTarget<14>;
def BranchTarget26Operand : BranchTarget<26>;
def PCRelLabel19Operand : PCRelLabel<19>;
-def MovZSymbolG3AsmOperand : AsmOperandClass {
- let Name = "MovZSymbolG3";
+def MovWSymbolG3AsmOperand : AsmOperandClass {
+ let Name = "MovWSymbolG3";
let RenderMethod = "addImmOperands";
}
-def movz_symbol_g3 : Operand<i32> {
- let ParserMatchClass = MovZSymbolG3AsmOperand;
+def movw_symbol_g3 : Operand<i32> {
+ let ParserMatchClass = MovWSymbolG3AsmOperand;
}
-def MovZSymbolG2AsmOperand : AsmOperandClass {
- let Name = "MovZSymbolG2";
+def MovWSymbolG2AsmOperand : AsmOperandClass {
+ let Name = "MovWSymbolG2";
let RenderMethod = "addImmOperands";
}
-def movz_symbol_g2 : Operand<i32> {
- let ParserMatchClass = MovZSymbolG2AsmOperand;
+def movw_symbol_g2 : Operand<i32> {
+ let ParserMatchClass = MovWSymbolG2AsmOperand;
}
-def MovZSymbolG1AsmOperand : AsmOperandClass {
- let Name = "MovZSymbolG1";
+def MovWSymbolG1AsmOperand : AsmOperandClass {
+ let Name = "MovWSymbolG1";
let RenderMethod = "addImmOperands";
}
-def movz_symbol_g1 : Operand<i32> {
- let ParserMatchClass = MovZSymbolG1AsmOperand;
+def movw_symbol_g1 : Operand<i32> {
+ let ParserMatchClass = MovWSymbolG1AsmOperand;
}
-def MovZSymbolG0AsmOperand : AsmOperandClass {
- let Name = "MovZSymbolG0";
+def MovWSymbolG0AsmOperand : AsmOperandClass {
+ let Name = "MovWSymbolG0";
let RenderMethod = "addImmOperands";
}
-def movz_symbol_g0 : Operand<i32> {
- let ParserMatchClass = MovZSymbolG0AsmOperand;
-}
-
-def MovKSymbolG3AsmOperand : AsmOperandClass {
- let Name = "MovKSymbolG3";
- let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g3 : Operand<i32> {
- let ParserMatchClass = MovKSymbolG3AsmOperand;
-}
-
-def MovKSymbolG2AsmOperand : AsmOperandClass {
- let Name = "MovKSymbolG2";
- let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g2 : Operand<i32> {
- let ParserMatchClass = MovKSymbolG2AsmOperand;
-}
-
-def MovKSymbolG1AsmOperand : AsmOperandClass {
- let Name = "MovKSymbolG1";
- let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g1 : Operand<i32> {
- let ParserMatchClass = MovKSymbolG1AsmOperand;
-}
-
-def MovKSymbolG0AsmOperand : AsmOperandClass {
- let Name = "MovKSymbolG0";
- let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g0 : Operand<i32> {
- let ParserMatchClass = MovKSymbolG0AsmOperand;
+def movw_symbol_g0 : Operand<i32> {
+ let ParserMatchClass = MovWSymbolG0AsmOperand;
}
class fixedpoint_i32<ValueType FloatVT>
@@ -673,6 +637,11 @@ def logical_imm64_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>;
+def gi_logical_imm32_XFORM : GICustomOperandRenderer<"renderLogicalImm32">,
+ GISDNodeXFormEquiv<logical_imm32_XFORM>;
+def gi_logical_imm64_XFORM : GICustomOperandRenderer<"renderLogicalImm64">,
+ GISDNodeXFormEquiv<logical_imm64_XFORM>;
+
let DiagnosticType = "LogicalSecondSource" in {
def LogicalImm32Operand : AsmOperandClass {
let Name = "LogicalImm32";
@@ -714,12 +683,15 @@ def logical_imm64_not : Operand<i64> {
let ParserMatchClass = LogicalImm64NotOperand;
}
-// imm0_65535 predicate - True if the immediate is in the range [0,65535].
-def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+// iXX_imm0_65535 predicates - True if the immediate is in the range [0,65535].
+let ParserMatchClass = AsmImmRange<0, 65535>, PrintMethod = "printImmHex" in {
+def i32_imm0_65535 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 65536;
-}]> {
- let ParserMatchClass = AsmImmRange<0, 65535>;
- let PrintMethod = "printImmHex";
+}]>;
+
+def i64_imm0_65535 : Operand<i64>, TImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 65536;
+}]>;
}
// imm0_255 predicate - True if the immediate is in the range [0,255].
@@ -815,6 +787,14 @@ class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width>
def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
+def gi_arith_shifted_reg32 :
+ GIComplexOperandMatcher<s32, "selectArithShiftedRegister">,
+ GIComplexPatternEquiv<arith_shifted_reg32>;
+
+def gi_arith_shifted_reg64 :
+ GIComplexOperandMatcher<s64, "selectArithShiftedRegister">,
+ GIComplexPatternEquiv<arith_shifted_reg64>;
+
// An arithmetic shifter operand:
// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
// {5-0} - imm6
@@ -837,6 +817,14 @@ class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop>
def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
+def gi_logical_shifted_reg32 :
+ GIComplexOperandMatcher<s32, "selectLogicalShiftedRegister">,
+ GIComplexPatternEquiv<logical_shifted_reg32>;
+
+def gi_logical_shifted_reg64 :
+ GIComplexOperandMatcher<s64, "selectLogicalShiftedRegister">,
+ GIComplexPatternEquiv<logical_shifted_reg64>;
+
// A logical vector shifter operand:
// {7-6} - shift type: 00 = lsl
// {5-0} - imm6: #0, #8, #16, or #24
@@ -918,6 +906,14 @@ class neg_addsub_shifted_imm<ValueType Ty>
def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
+def gi_neg_addsub_shifted_imm32 :
+ GIComplexOperandMatcher<s32, "selectNegArithImmed">,
+ GIComplexPatternEquiv<neg_addsub_shifted_imm32>;
+
+def gi_neg_addsub_shifted_imm64 :
+ GIComplexOperandMatcher<s64, "selectNegArithImmed">,
+ GIComplexPatternEquiv<neg_addsub_shifted_imm64>;
+
// An extend operand:
// {5-3} - extend type
// {2-0} - imm3
@@ -948,6 +944,21 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
let MIOperandInfo = (ops GPR32, arith_extend64);
}
+def arith_extended_reg32_i32 : arith_extended_reg32<i32>;
+def gi_arith_extended_reg32_i32 :
+ GIComplexOperandMatcher<s32, "selectArithExtendedRegister">,
+ GIComplexPatternEquiv<arith_extended_reg32_i32>;
+
+def arith_extended_reg32_i64 : arith_extended_reg32<i64>;
+def gi_arith_extended_reg32_i64 :
+ GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
+ GIComplexPatternEquiv<arith_extended_reg32_i64>;
+
+def arith_extended_reg32to64_i64 : arith_extended_reg32to64<i64>;
+def gi_arith_extended_reg32to64_i64 :
+ GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
+ GIComplexPatternEquiv<arith_extended_reg32to64_i64>;
+
// Floating-point immediate.
def fpimm16 : Operand<f16>,
FPImmLeaf<f16, [{
@@ -1000,8 +1011,8 @@ class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
let RenderMethod = "addVectorIndexOperands";
}
-class AsmVectorIndexOpnd<AsmOperandClass mc, code pred>
- : Operand<i64>, ImmLeaf<i64, pred> {
+class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc, code pred>
+ : Operand<ty>, ImmLeaf<ty, pred> {
let ParserMatchClass = mc;
let PrintMethod = "printVectorIndex";
}
@@ -1012,11 +1023,17 @@ def VectorIndexHOperand : AsmVectorIndex<0, 7>;
def VectorIndexSOperand : AsmVectorIndex<0, 3>;
def VectorIndexDOperand : AsmVectorIndex<0, 1>;
-def VectorIndex1 : AsmVectorIndexOpnd<VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
-def VectorIndexB : AsmVectorIndexOpnd<VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
-def VectorIndexH : AsmVectorIndexOpnd<VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
-def VectorIndexS : AsmVectorIndexOpnd<VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
-def VectorIndexD : AsmVectorIndexOpnd<VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+def VectorIndex1 : AsmVectorIndexOpnd<i64, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
+def VectorIndexB : AsmVectorIndexOpnd<i64, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def VectorIndexH : AsmVectorIndexOpnd<i64, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def VectorIndexS : AsmVectorIndexOpnd<i64, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+def VectorIndexD : AsmVectorIndexOpnd<i64, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+
+def VectorIndex132b : AsmVectorIndexOpnd<i32, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
+def VectorIndexB32b : AsmVectorIndexOpnd<i32, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+def VectorIndexH32b : AsmVectorIndexOpnd<i32, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+def VectorIndexS32b : AsmVectorIndexOpnd<i32, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+def VectorIndexD32b : AsmVectorIndexOpnd<i32, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">;
def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">;
@@ -1025,15 +1042,15 @@ def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">;
def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">;
def sve_elm_idx_extdup_b
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
def sve_elm_idx_extdup_h
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
def sve_elm_idx_extdup_s
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
def sve_elm_idx_extdup_d
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
def sve_elm_idx_extdup_q
- : AsmVectorIndexOpnd<SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+ : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
@@ -1082,6 +1099,45 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
let Inst{4-0} = Rt;
}
+// System instructions for transactional memory extension
+class TMBaseSystemI<bit L, bits<4> CRm, bits<3> op2, dag oops, dag iops,
+ string asm, string operands, list<dag> pattern>
+ : BaseSystemI<L, oops, iops, asm, operands, pattern>,
+ Sched<[WriteSys]> {
+ let Inst{20-12} = 0b000110011;
+ let Inst{11-8} = CRm;
+ let Inst{7-5} = op2;
+ let DecoderMethod = "";
+
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+// System instructions for transactional memory - single input operand
+class TMSystemI<bits<4> CRm, string asm, list<dag> pattern>
+ : TMBaseSystemI<0b1, CRm, 0b011,
+ (outs GPR64:$Rt), (ins), asm, "\t$Rt", pattern> {
+ bits<5> Rt;
+ let Inst{4-0} = Rt;
+}
+
+// System instructions for transactional memory - no operand
+class TMSystemINoOperand<bits<4> CRm, string asm, list<dag> pattern>
+ : TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> {
+ let Inst{4-0} = 0b11111;
+}
+
+// System instructions for exit from transactions
+class TMSystemException<bits<3> op1, string asm, list<dag> pattern>
+ : I<(outs), (ins i64_imm0_65535:$imm), asm, "\t$imm", "", pattern>,
+ Sched<[WriteSys]> {
+ bits<16> imm;
+ let Inst{31-24} = 0b11010100;
+ let Inst{23-21} = op1;
+ let Inst{20-5} = imm;
+ let Inst{4-0} = 0b00000;
+}
+
// Hint instructions that take both a CRm and a 3-bit immediate.
// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
// model patterns with sufficiently fine granularity
@@ -2180,11 +2236,11 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
// Add/Subtract extended register
let AddedComplexity = 1, hasSideEffects = 0 in {
def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
- arith_extended_reg32<i32>, mnemonic, OpNode> {
+ arith_extended_reg32_i32, mnemonic, OpNode> {
let Inst{31} = 0;
}
def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
- arith_extended_reg32to64<i64>, mnemonic, OpNode> {
+ arith_extended_reg32to64_i64, mnemonic, OpNode> {
let Inst{31} = 1;
}
}
@@ -2254,11 +2310,11 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
// Add/Subtract extended register
let AddedComplexity = 1 in {
def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
- arith_extended_reg32<i32>, mnemonic, OpNode> {
+ arith_extended_reg32_i32, mnemonic, OpNode> {
let Inst{31} = 0;
}
def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
- arith_extended_reg32<i64>, mnemonic, OpNode> {
+ arith_extended_reg32_i64, mnemonic, OpNode> {
let Inst{31} = 1;
}
}
@@ -2969,6 +3025,22 @@ def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
+def gi_ro_Xindexed8 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<8>">,
+ GIComplexPatternEquiv<ro_Xindexed8>;
+def gi_ro_Xindexed16 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<16>">,
+ GIComplexPatternEquiv<ro_Xindexed16>;
+def gi_ro_Xindexed32 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<32>">,
+ GIComplexPatternEquiv<ro_Xindexed32>;
+def gi_ro_Xindexed64 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<64>">,
+ GIComplexPatternEquiv<ro_Xindexed64>;
+def gi_ro_Xindexed128 :
+ GIComplexOperandMatcher<s64, "selectAddrModeXRO<128>">,
+ GIComplexPatternEquiv<ro_Xindexed128>;
+
def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
@@ -4086,7 +4158,7 @@ multiclass MemTagStore<bits<2> opc1, string insn> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
- : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
+ : I<(outs), (ins i32_imm0_65535:$imm), asm, "\t$imm", "", []>,
Sched<[WriteSys]> {
bits<16> imm;
let Inst{31-24} = 0b11010100;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 215e96a82d0e..5c35e5bcdd30 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
@@ -82,6 +83,10 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
}
+ // Meta-instructions emit no code.
+ if (MI.isMetaInstruction())
+ return 0;
+
// FIXME: We currently only handle pseudoinstructions that don't get expanded
// before the assembly printer.
unsigned NumBytes = 0;
@@ -91,12 +96,6 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
// Anything not explicitly designated otherwise is a normal 4-byte insn.
NumBytes = 4;
break;
- case TargetOpcode::DBG_VALUE:
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- NumBytes = 0;
- break;
case TargetOpcode::STACKMAP:
// The upper bound for a stackmap intrinsic is the full length of its shadow
NumBytes = StackMapOpers(&MI).getNumPatchBytes();
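(Illustrative sketch, not part of the patch: with the early isMetaInstruction() check above, callers no longer rely on per-opcode cases for DBG_VALUE, EH_LABEL, IMPLICIT_DEF or KILL, all of which MachineInstr::isMetaInstruction() covers. Assuming a MachineBasicBlock MBB and an AArch64InstrInfo pointer TII are in scope:)

    unsigned BlockSize = 0;
    for (const MachineInstr &MI : MBB) {
      unsigned Size = TII->getInstSizeInBytes(MI);
      // Meta-instructions (debug values, labels, kills, ...) must report 0 bytes.
      assert((!MI.isMetaInstruction() || Size == 0) &&
             "meta-instruction reported a nonzero size");
      BlockSize += Size;
    }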
@@ -416,7 +415,7 @@ unsigned AArch64InstrInfo::insertBranch(
// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
- while (TargetRegisterInfo::isVirtualRegister(VReg)) {
+ while (Register::isVirtualRegister(VReg)) {
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
if (!DefMI->isFullCopy())
return VReg;
@@ -431,7 +430,7 @@ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
unsigned *NewVReg = nullptr) {
VReg = removeCopies(MRI, VReg);
- if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ if (!Register::isVirtualRegister(VReg))
return 0;
bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
@@ -574,7 +573,7 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
CC = AArch64CC::NE;
break;
}
- unsigned SrcReg = Cond[2].getReg();
+ Register SrcReg = Cond[2].getReg();
if (Is64Bit) {
// cmp reg, #0 is actually subs xzr, reg, #0.
MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
@@ -930,7 +929,7 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
}
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
- const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const {
+ const MachineInstr &MIa, const MachineInstr &MIb) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
int64_t OffsetA = 0, OffsetB = 0;
@@ -1071,8 +1070,8 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) {
assert(MO.isReg() &&
"Operand has register constraints without being a register!");
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg)) {
if (!OpRegCstraints->contains(Reg))
return false;
} else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
@@ -1472,6 +1471,8 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return false;
MachineBasicBlock &MBB = *MI.getParent();
+ auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
+ auto TRI = Subtarget.getRegisterInfo();
DebugLoc DL = MI.getDebugLoc();
if (MI.getOpcode() == AArch64::CATCHRET) {
@@ -1497,21 +1498,32 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MI.memoperands_begin())->getValue());
const TargetMachine &TM = MBB.getParent()->getTarget();
- unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
+ unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
const unsigned char MO_NC = AArch64II::MO_NC;
if ((OpFlags & AArch64II::MO_GOT) != 0) {
BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
.addGlobalAddress(GV, 0, OpFlags);
- BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(0)
- .addMemOperand(*MI.memoperands_begin());
+ if (Subtarget.isTargetILP32()) {
+ unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
+ .addDef(Reg32, RegState::Dead)
+ .addUse(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin())
+ .addDef(Reg, RegState::Implicit);
+ } else {
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(*MI.memoperands_begin());
+ }
} else if (TM.getCodeModel() == CodeModel::Large) {
+ assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
.addImm(0);
@@ -1538,10 +1550,20 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
.addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
- BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
- .addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, LoFlags)
- .addMemOperand(*MI.memoperands_begin());
+ if (Subtarget.isTargetILP32()) {
+ unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
+ .addDef(Reg32, RegState::Dead)
+ .addUse(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, LoFlags)
+ .addMemOperand(*MI.memoperands_begin())
+ .addDef(Reg, RegState::Implicit);
+ } else {
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, LoFlags)
+ .addMemOperand(*MI.memoperands_begin());
+ }
}
MBB.erase(MI);
@@ -1581,7 +1603,7 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
break;
case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
return (AArch64::GPR32RegClass.contains(DstReg) ||
AArch64::GPR64RegClass.contains(DstReg));
}
@@ -1611,7 +1633,7 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
break;
case TargetOpcode::COPY: {
    // FPR64 copies will be lowered to ORR.16b
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
return (AArch64::FPR64RegClass.contains(DstReg) ||
AArch64::FPR128RegClass.contains(DstReg));
}
@@ -1917,7 +1939,7 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
// e.g., ldr x0, [x0]
// This case will never occur with an FI base.
if (MI.getOperand(1).isReg()) {
- unsigned BaseReg = MI.getOperand(1).getReg();
+ Register BaseReg = MI.getOperand(1).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (MI.modifiesRegister(BaseReg, TRI))
return false;
@@ -1928,6 +1950,17 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
if (isLdStPairSuppressed(MI))
return false;
+ // Do not pair any callee-save store/reload instructions in the
+ // prologue/epilogue if the CFI information encoded the operations as separate
+ // instructions, as that will cause the size of the actual prologue to mismatch
+ // with the prologue size recorded in the Windows CFI.
+ const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
+ bool NeedsWinCFI = MAI->usesWindowsCFI() &&
+ MI.getMF()->getFunction().needsUnwindTableEntry();
+ if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy)))
+ return false;
+
// On some CPUs quad load/store pairs are slower than two single load/stores.
if (Subtarget.isPaired128Slow()) {
switch (MI.getOpcode()) {
@@ -2165,6 +2198,18 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
MinOffset = -256;
MaxOffset = 255;
break;
+ case AArch64::LDR_PXI:
+ case AArch64::STR_PXI:
+ Scale = Width = 2;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
+ case AArch64::LDR_ZXI:
+ case AArch64::STR_ZXI:
+ Scale = Width = 16;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
Scale = 16;
@@ -2350,7 +2395,7 @@ static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
if (!SubIdx)
return MIB.addReg(Reg, State);
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
return MIB.addReg(Reg, State, SubIdx);
}
@@ -2474,6 +2519,27 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ // Copy a Predicate register by ORRing with itself.
+ if (AArch64::PPRRegClass.contains(DestReg) &&
+ AArch64::PPRRegClass.contains(SrcReg)) {
+ assert(Subtarget.hasSVE() && "Unexpected SVE register.");
+ BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
+ .addReg(SrcReg) // Pg
+ .addReg(SrcReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ // Copy a Z register by ORRing with itself.
+ if (AArch64::ZPRRegClass.contains(DestReg) &&
+ AArch64::ZPRRegClass.contains(SrcReg)) {
+ assert(Subtarget.hasSVE() && "Unexpected SVE register.");
+ BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
if (AArch64::GPR64spRegClass.contains(DestReg) &&
(AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
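(A minimal usage sketch, not from the patch, of the two new SVE copy cases above. It assumes a block MBB, insertion point I and DebugLoc DL are in scope; the registers P0/P1 and Z2/Z3 are chosen purely for illustration:)

    // $p1 = COPY $p0 now lowers to a self-predicated ORR: p1 = ORR_PPzPP p0, p0, p0
    TII->copyPhysReg(MBB, I, DL, AArch64::P1, AArch64::P0, /*KillSrc=*/true);
    // $z3 = COPY $z2 lowers to an unpredicated ORR of the source with itself.
    TII->copyPhysReg(MBB, I, DL, AArch64::Z3, AArch64::Z2, /*KillSrc=*/false);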
@@ -2722,7 +2788,7 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
MachineMemOperand *MMO) {
unsigned SrcReg0 = SrcReg;
unsigned SrcReg1 = SrcReg;
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ if (Register::isPhysicalRegister(SrcReg)) {
SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
SubIdx0 = 0;
SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
@@ -2761,7 +2827,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRWui;
- if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ if (Register::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
else
assert(SrcReg != AArch64::WSP);
@@ -2771,7 +2837,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRXui;
- if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+ if (Register::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
else
assert(SrcReg != AArch64::SP);
@@ -2852,7 +2918,7 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
unsigned DestReg0 = DestReg;
unsigned DestReg1 = DestReg;
bool IsUndef = true;
- if (TargetRegisterInfo::isPhysicalRegister(DestReg)) {
+ if (Register::isPhysicalRegister(DestReg)) {
DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
SubIdx0 = 0;
DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
@@ -2892,7 +2958,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRWui;
- if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ if (Register::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
else
assert(DestReg != AArch64::WSP);
@@ -2902,7 +2968,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRXui;
- if (TargetRegisterInfo::isVirtualRegister(DestReg))
+ if (Register::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
else
assert(DestReg != AArch64::SP);
@@ -2972,21 +3038,39 @@ void AArch64InstrInfo::loadRegFromStackSlot(
MI.addMemOperand(MMO);
}
-void llvm::emitFrameOffset(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
- unsigned DestReg, unsigned SrcReg, int Offset,
- const TargetInstrInfo *TII,
- MachineInstr::MIFlag Flag, bool SetNZCV,
- bool NeedsWinCFI, bool *HasWinCFI) {
- if (DestReg == SrcReg && Offset == 0)
- return;
-
- assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
- "SP increment/decrement not 16-byte aligned");
-
- bool isSub = Offset < 0;
- if (isSub)
- Offset = -Offset;
+// Helper function to emit a frame offset adjustment from a given
+// pointer (SrcReg), stored into DestReg. This function is explicit
+// in that it requires the opcode.
+static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, int64_t Offset, unsigned Opc,
+ const TargetInstrInfo *TII,
+ MachineInstr::MIFlag Flag, bool NeedsWinCFI,
+ bool *HasWinCFI) {
+ int Sign = 1;
+ unsigned MaxEncoding, ShiftSize;
+ switch (Opc) {
+ case AArch64::ADDXri:
+ case AArch64::ADDSXri:
+ case AArch64::SUBXri:
+ case AArch64::SUBSXri:
+ MaxEncoding = 0xfff;
+ ShiftSize = 12;
+ break;
+ case AArch64::ADDVL_XXI:
+ case AArch64::ADDPL_XXI:
+ MaxEncoding = 31;
+ ShiftSize = 0;
+ if (Offset < 0) {
+ MaxEncoding = 32;
+ Sign = -1;
+ Offset = -Offset;
+ }
+ break;
+ default:
+ llvm_unreachable("Unsupported opcode");
+ }
// FIXME: If the offset won't fit in 24-bits, compute the offset into a
// scratch register. If DestReg is a virtual register, use it as the
@@ -2999,65 +3083,94 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
// of code.
// assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
- unsigned Opc;
- if (SetNZCV)
- Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
- else
- Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
- const unsigned MaxEncoding = 0xfff;
- const unsigned ShiftSize = 12;
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
- while (((unsigned)Offset) >= (1 << ShiftSize)) {
- unsigned ThisVal;
- if (((unsigned)Offset) > MaxEncodableValue) {
- ThisVal = MaxEncodableValue;
- } else {
- ThisVal = Offset & MaxEncodableValue;
+ do {
+ unsigned ThisVal = std::min<unsigned>(Offset, MaxEncodableValue);
+ unsigned LocalShiftSize = 0;
+ if (ThisVal > MaxEncoding) {
+ ThisVal = ThisVal >> ShiftSize;
+ LocalShiftSize = ShiftSize;
}
assert((ThisVal >> ShiftSize) <= MaxEncoding &&
"Encoding cannot handle value that big");
- BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
- .addReg(SrcReg)
- .addImm(ThisVal >> ShiftSize)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
- .setMIFlag(Flag);
-
- if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) {
+ auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+ .addReg(SrcReg)
+ .addImm(Sign * (int)ThisVal);
+ if (ShiftSize)
+ MBI = MBI.addImm(
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
+ MBI = MBI.setMIFlag(Flag);
+
+ if (NeedsWinCFI) {
+ assert(Sign == 1 && "SEH directives should always have a positive sign");
+ int Imm = (int)(ThisVal << LocalShiftSize);
+ if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
+ (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
+ if (HasWinCFI)
+ *HasWinCFI = true;
+ if (Imm == 0)
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
+ else
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
+ .addImm(Imm)
+ .setMIFlag(Flag);
+ assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to "
+ "emit a single SEH directive");
+ } else if (DestReg == AArch64::SP) {
+ if (HasWinCFI)
+ *HasWinCFI = true;
+ assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+ .addImm(Imm)
+ .setMIFlag(Flag);
+ }
if (HasWinCFI)
*HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
- .addImm(ThisVal)
- .setMIFlag(Flag);
}
SrcReg = DestReg;
- Offset -= ThisVal;
- if (Offset == 0)
- return;
- }
- BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
- .addReg(SrcReg)
- .addImm(Offset)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
- .setMIFlag(Flag);
+ Offset -= ThisVal << LocalShiftSize;
+ } while (Offset);
+}
- if (NeedsWinCFI) {
- if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
- (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
- if (HasWinCFI)
- *HasWinCFI = true;
- if (Offset == 0)
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
- setMIFlag(Flag);
- else
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
- addImm(Offset).setMIFlag(Flag);
- } else if (DestReg == AArch64::SP) {
- if (HasWinCFI)
- *HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
- addImm(Offset).setMIFlag(Flag);
+void llvm::emitFrameOffset(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ unsigned DestReg, unsigned SrcReg,
+ StackOffset Offset, const TargetInstrInfo *TII,
+ MachineInstr::MIFlag Flag, bool SetNZCV,
+ bool NeedsWinCFI, bool *HasWinCFI) {
+ int64_t Bytes, NumPredicateVectors, NumDataVectors;
+ Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
+
+ // First emit non-scalable frame offsets, or a simple 'mov'.
+ if (Bytes || (!Offset && SrcReg != DestReg)) {
+ assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
+ "SP increment/decrement not 16-byte aligned");
+ unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
+ if (Bytes < 0) {
+ Bytes = -Bytes;
+ Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
}
+ emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
+ NeedsWinCFI, HasWinCFI);
+ SrcReg = DestReg;
+ }
+
+ assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
+ "SetNZCV not supported with SVE vectors");
+ assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
+ "WinCFI not supported with SVE vectors");
+
+ if (NumDataVectors) {
+ emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
+ AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
+ SrcReg = DestReg;
+ }
+
+ if (NumPredicateVectors) {
+ assert(DestReg != AArch64::SP && "Unaligned access to SP");
+ emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
+ AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
}
}
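(A caller-side sketch of the reworked interface, assuming the StackOffset helper from AArch64StackOffset.h behaves as it is used above — MVT::i8 for fixed bytes, MVT::nxv1i8 for scalable bytes — and assuming the trailing emitFrameOffset parameters keep their default values:)

    // Allocate 16 fixed bytes plus 32 scalable bytes (two SVE data vectors)
    // below SP; the fixed part becomes a SUBXri, the scalable part an ADDVL.
    StackOffset Alloc = StackOffset(-16, MVT::i8) + StackOffset(-32, MVT::nxv1i8);
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, Alloc, TII,
                    MachineInstr::FrameSetup);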
@@ -3079,15 +3192,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
// <rdar://problem/11522048>
//
if (MI.isFullCopy()) {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (SrcReg == AArch64::SP &&
- TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
return nullptr;
}
- if (DstReg == AArch64::SP &&
- TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
return nullptr;
}
@@ -3127,14 +3238,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
MachineBasicBlock &MBB = *MI.getParent();
const MachineOperand &DstMO = MI.getOperand(0);
const MachineOperand &SrcMO = MI.getOperand(1);
- unsigned DstReg = DstMO.getReg();
- unsigned SrcReg = SrcMO.getReg();
+ Register DstReg = DstMO.getReg();
+ Register SrcReg = SrcMO.getReg();
// This is slightly expensive to compute for physical regs since
// getMinimalPhysRegClass is slow.
auto getRegClass = [&](unsigned Reg) {
- return TargetRegisterInfo::isVirtualRegister(Reg)
- ? MRI.getRegClass(Reg)
- : TRI.getMinimalPhysRegClass(Reg);
+ return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
+ : TRI.getMinimalPhysRegClass(Reg);
};
if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
@@ -3159,8 +3269,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
//
// STRXui %xzr, %stack.0
//
- if (IsSpill && DstMO.isUndef() &&
- TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
assert(SrcMO.getSubReg() == 0 &&
"Unexpected subreg on physical register");
const TargetRegisterClass *SpillRC;
@@ -3243,10 +3352,23 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
return nullptr;
}
-int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+static bool isSVEScaledImmInstruction(unsigned Opcode) {
+ switch (Opcode) {
+ case AArch64::LDR_ZXI:
+ case AArch64::STR_ZXI:
+ case AArch64::LDR_PXI:
+ case AArch64::STR_PXI:
+ return true;
+ default:
+ return false;
+ }
+}
+
+int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
+ StackOffset &SOffset,
bool *OutUseUnscaledOp,
unsigned *OutUnscaledOp,
- int *EmittableOffset) {
+ int64_t *EmittableOffset) {
// Set output values in case of early exit.
if (EmittableOffset)
*EmittableOffset = 0;
@@ -3285,6 +3407,10 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
// Construct the complete offset.
+ bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode());
+ int64_t Offset =
+ IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes());
+
const MachineOperand &ImmOpnd =
MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
Offset += ImmOpnd.getImm() * Scale;
@@ -3304,7 +3430,7 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
"Cannot have remainder when using unscaled op");
assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
- int NewOffset = Offset / Scale;
+ int64_t NewOffset = Offset / Scale;
if (MinOff <= NewOffset && NewOffset <= MaxOff)
Offset = Remainder;
else {
@@ -3319,27 +3445,33 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
if (OutUnscaledOp && UnscaledOp)
*OutUnscaledOp = *UnscaledOp;
+ if (IsMulVL)
+ SOffset = StackOffset(Offset, MVT::nxv1i8) +
+ StackOffset(SOffset.getBytes(), MVT::i8);
+ else
+ SOffset = StackOffset(Offset, MVT::i8) +
+ StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
return AArch64FrameOffsetCanUpdate |
- (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
+ (SOffset ? 0 : AArch64FrameOffsetIsLegal);
}
bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ unsigned FrameReg, StackOffset &Offset,
const AArch64InstrInfo *TII) {
unsigned Opcode = MI.getOpcode();
unsigned ImmIdx = FrameRegIdx + 1;
if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
- Offset += MI.getOperand(ImmIdx).getImm();
+ Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
MI.getOperand(0).getReg(), FrameReg, Offset, TII,
MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
MI.eraseFromParent();
- Offset = 0;
+ Offset = StackOffset();
return true;
}
- int NewOffset;
+ int64_t NewOffset;
unsigned UnscaledOp;
bool UseUnscaledOp;
int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
@@ -3352,7 +3484,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
MI.setDesc(TII->get(UnscaledOp));
MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
- return Offset == 0;
+ return !Offset;
}
return false;
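(A corresponding caller sketch, e.g. from a frame-index elimination pass, with FrameReg, FIOperandNum, FixedBytes and ScalableBytes used only as placeholder names for values the caller already has:)

    StackOffset Offset = StackOffset(FixedBytes, MVT::i8) +
                         StackOffset(ScalableBytes, MVT::nxv1i8);
    if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
      return; // Frame index and offset were fully folded into the instruction.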
@@ -3428,13 +3560,19 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
switch (Inst.getOpcode()) {
default:
break;
+ case AArch64::FADDHrr:
case AArch64::FADDSrr:
case AArch64::FADDDrr:
+ case AArch64::FADDv4f16:
+ case AArch64::FADDv8f16:
case AArch64::FADDv2f32:
case AArch64::FADDv2f64:
case AArch64::FADDv4f32:
+ case AArch64::FSUBHrr:
case AArch64::FSUBSrr:
case AArch64::FSUBDrr:
+ case AArch64::FSUBv4f16:
+ case AArch64::FSUBv8f16:
case AArch64::FSUBv2f32:
case AArch64::FSUBv2f64:
case AArch64::FSUBv4f32:
@@ -3459,7 +3597,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineInstr *MI = nullptr;
- if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
MI = MRI.getUniqueVRegDef(MO.getReg());
// And it needs to be in the trace (otherwise, it won't have a depth).
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
@@ -3544,86 +3682,48 @@ static bool getMaddPatterns(MachineInstr &Root,
Opc = NewOpc;
}
+ auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
+ MachineCombinerPattern Pattern) {
+ if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
+ Patterns.push_back(Pattern);
+ Found = true;
+ }
+ };
+
+ typedef MachineCombinerPattern MCP;
+
switch (Opc) {
default:
break;
case AArch64::ADDWrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"ADDWrr does not have register operands");
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
- Found = true;
- }
- if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
- Found = true;
- }
+ setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
+ setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
break;
case AArch64::ADDXrr:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
- Found = true;
- }
- if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
- Found = true;
- }
+ setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
+ setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
break;
case AArch64::SUBWrr:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
- Found = true;
- }
- if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
- Found = true;
- }
+ setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
+ setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
break;
case AArch64::SUBXrr:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
- Found = true;
- }
- if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
- Found = true;
- }
+ setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
+ setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
break;
case AArch64::ADDWri:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
- Found = true;
- }
+ setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
break;
case AArch64::ADDXri:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
- Found = true;
- }
+ setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
break;
case AArch64::SUBWri:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
- AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
- Found = true;
- }
+ setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
break;
case AArch64::SUBXri:
- if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
- AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
- Found = true;
- }
+ setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
break;
}
return Found;
@@ -3640,204 +3740,135 @@ static bool getFMAPatterns(MachineInstr &Root,
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
+ auto Match = [&](int Opcode, int Operand,
+ MachineCombinerPattern Pattern) -> bool {
+ if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
+ Patterns.push_back(Pattern);
+ return true;
+ }
+ return false;
+ };
+
+ typedef MachineCombinerPattern MCP;
+
switch (Root.getOpcode()) {
default:
assert(false && "Unsupported FP instruction in combiner\n");
break;
+ case AArch64::FADDHrr:
+ assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+ "FADDHrr does not have register operands");
+
+ Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
+ Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
+ break;
case AArch64::FADDSrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
- "FADDWrr does not have register operands");
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv1i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv1i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
- Found = true;
- }
+ "FADDSrr does not have register operands");
+
+ Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
+ Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
+
+ Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
+ Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
break;
case AArch64::FADDDrr:
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv1i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv1i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
- Found = true;
- }
+ Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
+ Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
+
+ Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
+ Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
+ break;
+ case AArch64::FADDv4f16:
+ Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
+ Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
+
+ Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
+ Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
+ break;
+ case AArch64::FADDv8f16:
+ Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
+ Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
+
+ Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
+ Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
break;
case AArch64::FADDv2f32:
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
+ Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
+
+ Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
+ Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
break;
case AArch64::FADDv2f64:
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2f64)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2f64)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
+ Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
+
+ Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
+ Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
break;
case AArch64::FADDv4f32:
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv4i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv4f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv4i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv4f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
- Found = true;
- }
- break;
+ Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
+ Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
+ Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
+ Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
+ break;
+ case AArch64::FSUBHrr:
+ Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
+ Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
+ Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
+ break;
case AArch64::FSUBSrr:
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv1i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
- Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
- Found = true;
- }
+ Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
+
+ Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
+ Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
+
+ Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
break;
case AArch64::FSUBDrr:
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv1i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
- Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
- Found = true;
- }
+ Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
+
+ Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
+ Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
+
+ Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
+ break;
+ case AArch64::FSUBv4f16:
+ Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
+ Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
+
+ Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
+ Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
+ break;
+ case AArch64::FSUBv8f16:
+ Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
+ Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
+
+ Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
+ Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
break;
case AArch64::FSUBv2f32:
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
+ Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
+
+ Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
+ Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
break;
case AArch64::FSUBv2f64:
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv2f64)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2i64_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv2f64)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
+ Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
+
+ Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
+ Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
break;
case AArch64::FSUBv4f32:
- if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv4i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
- AArch64::FMULv4f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
- Found = true;
- }
- if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv4i32_indexed)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1);
- Found = true;
- } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
- AArch64::FMULv4f32)) {
- Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1);
- Found = true;
- }
+ Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
+ Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
+
+ Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
+ Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
break;
}
return Found;
@@ -3851,6 +3882,10 @@ bool AArch64InstrInfo::isThroughputPattern(
switch (Pattern) {
default:
break;
+ case MachineCombinerPattern::FMULADDH_OP1:
+ case MachineCombinerPattern::FMULADDH_OP2:
+ case MachineCombinerPattern::FMULSUBH_OP1:
+ case MachineCombinerPattern::FMULSUBH_OP2:
case MachineCombinerPattern::FMULADDS_OP1:
case MachineCombinerPattern::FMULADDS_OP2:
case MachineCombinerPattern::FMULSUBS_OP1:
@@ -3859,12 +3894,21 @@ bool AArch64InstrInfo::isThroughputPattern(
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FNMULSUBH_OP1:
case MachineCombinerPattern::FNMULSUBS_OP1:
case MachineCombinerPattern::FNMULSUBD_OP1:
+ case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
+ case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
+ case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
+ case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv4f16_OP2:
+ case MachineCombinerPattern::FMLAv4f16_OP1:
+ case MachineCombinerPattern::FMLAv8f16_OP1:
+ case MachineCombinerPattern::FMLAv8f16_OP2:
case MachineCombinerPattern::FMLAv2f32_OP2:
case MachineCombinerPattern::FMLAv2f32_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
@@ -3877,10 +3921,18 @@ bool AArch64InstrInfo::isThroughputPattern(
case MachineCombinerPattern::FMLAv4f32_OP2:
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
+ case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
+ case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
+ case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLSv4f16_OP1:
+ case MachineCombinerPattern::FMLSv4f16_OP2:
+ case MachineCombinerPattern::FMLSv8f16_OP1:
+ case MachineCombinerPattern::FMLSv8f16_OP2:
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
@@ -3933,15 +3985,15 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
unsigned MaddOpc, const TargetRegisterClass *RC,
FMAInstKind kind = FMAInstKind::Default,
- const unsigned *ReplacedAddend = nullptr) {
+ const Register *ReplacedAddend = nullptr) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
- unsigned ResultReg = Root.getOperand(0).getReg();
- unsigned SrcReg0 = MUL->getOperand(1).getReg();
+ Register ResultReg = Root.getOperand(0).getReg();
+ Register SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
- unsigned SrcReg1 = MUL->getOperand(2).getReg();
+ Register SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
unsigned SrcReg2;
@@ -3955,13 +4007,13 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
}
- if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+ if (Register::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+ if (Register::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+ if (Register::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
+ if (Register::isVirtualRegister(SrcReg2))
MRI.constrainRegClass(SrcReg2, RC);
MachineInstrBuilder MIB;
@@ -4015,19 +4067,19 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
- unsigned ResultReg = Root.getOperand(0).getReg();
- unsigned SrcReg0 = MUL->getOperand(1).getReg();
+ Register ResultReg = Root.getOperand(0).getReg();
+ Register SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
- unsigned SrcReg1 = MUL->getOperand(2).getReg();
+ Register SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
- if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+ if (Register::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+ if (Register::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
- if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+ if (Register::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
- if (TargetRegisterInfo::isVirtualRegister(VR))
+ if (Register::isVirtualRegister(VR))
MRI.constrainRegClass(VR, RC);
MachineInstrBuilder MIB =
@@ -4116,7 +4168,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ Register NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
@@ -4158,7 +4210,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- unsigned NewVR = MRI.createVirtualRegister(SubRC);
+ Register NewVR = MRI.createVirtualRegister(SubRC);
// SUB NewVR, 0, C
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
@@ -4208,7 +4260,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ Register NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
unsigned Val = Root.getOperand(3).getImm();
@@ -4228,34 +4280,35 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
break;
}
// Floating Point Support
+ case MachineCombinerPattern::FMULADDH_OP1:
+ Opc = AArch64::FMADDHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
case MachineCombinerPattern::FMULADDS_OP1:
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
case MachineCombinerPattern::FMULADDD_OP1:
- // MUL I=A,B,0
- // ADD R,I,C
- // ==> MADD R,A,B,C
- // --- Create(MADD);
- if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
- Opc = AArch64::FMADDSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FMADDDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
+
+ case MachineCombinerPattern::FMULADDH_OP2:
+ Opc = AArch64::FMADDHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
case MachineCombinerPattern::FMULADDS_OP2:
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
case MachineCombinerPattern::FMULADDD_OP2:
- // FMUL I=A,B,0
- // FADD R,C,I
- // ==> FMADD R,A,B,C
- // --- Create(FMADD);
- if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
- Opc = AArch64::FMADDSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FMADDDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
@@ -4285,6 +4338,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
FMAInstKind::Indexed);
break;
+ case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLAv4i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv4f16_OP1:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLAv4f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+ case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLAv4i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv4f16_OP2:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLAv4f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2f32_OP1:
RC = &AArch64::FPR64RegClass;
@@ -4312,6 +4390,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
+ case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLAv8i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv8f16_OP1:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLAv8f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+ case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLAv8i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv8f16_OP2:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLAv8f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
RC = &AArch64::FPR128RegClass;
@@ -4367,56 +4470,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
+ case MachineCombinerPattern::FMULSUBH_OP1:
+ Opc = AArch64::FNMSUBHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
case MachineCombinerPattern::FMULSUBS_OP1:
- case MachineCombinerPattern::FMULSUBD_OP1: {
- // FMUL I=A,B,0
- // FSUB R,I,C
- // ==> FNMSUB R,A,B,C // = -C + A*B
- // --- Create(FNMSUB);
- if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
- Opc = AArch64::FNMSUBSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FNMSUBDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FNMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::FMULSUBD_OP1:
+ Opc = AArch64::FNMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
- }
+ case MachineCombinerPattern::FNMULSUBH_OP1:
+ Opc = AArch64::FNMADDHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
case MachineCombinerPattern::FNMULSUBS_OP1:
- case MachineCombinerPattern::FNMULSUBD_OP1: {
- // FNMUL I=A,B,0
- // FSUB R,I,C
- // ==> FNMADD R,A,B,C // = -A*B - C
- // --- Create(FNMADD);
- if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
- Opc = AArch64::FNMADDSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FNMADDDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FNMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::FNMULSUBD_OP1:
+ Opc = AArch64::FNMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
- }
+ case MachineCombinerPattern::FMULSUBH_OP2:
+ Opc = AArch64::FMSUBHrrr;
+ RC = &AArch64::FPR16RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
case MachineCombinerPattern::FMULSUBS_OP2:
- case MachineCombinerPattern::FMULSUBD_OP2: {
- // FMUL I=A,B,0
- // FSUB R,C,I
- // ==> FMSUB R,A,B,C (computes C - A*B)
- // --- Create(FMSUB);
- if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
- Opc = AArch64::FMSUBSrrr;
- RC = &AArch64::FPR32RegClass;
- } else {
- Opc = AArch64::FMSUBDrrr;
- RC = &AArch64::FPR64RegClass;
- }
+ Opc = AArch64::FMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::FMULSUBD_OP2:
+ Opc = AArch64::FMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
- }
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
@@ -4432,6 +4532,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
FMAInstKind::Indexed);
break;
+ case MachineCombinerPattern::FMLSv4f16_OP1:
+ case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
+ RC = &AArch64::FPR64RegClass;
+ Register NewVR = MRI.createVirtualRegister(RC);
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
+ .add(Root.getOperand(2));
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
+ Opc = AArch64::FMLAv4f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator, &NewVR);
+ } else {
+ Opc = AArch64::FMLAv4i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed, &NewVR);
+ }
+ break;
+ }
+ case MachineCombinerPattern::FMLSv4f16_OP2:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLSv4f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+ case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
+ RC = &AArch64::FPR64RegClass;
+ Opc = AArch64::FMLSv4i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
RC = &AArch64::FPR64RegClass;
@@ -4446,6 +4579,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
+ case MachineCombinerPattern::FMLSv8f16_OP1:
+ case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
+ RC = &AArch64::FPR128RegClass;
+ Register NewVR = MRI.createVirtualRegister(RC);
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
+ .add(Root.getOperand(2));
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
+ Opc = AArch64::FMLAv8f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator, &NewVR);
+ } else {
+ Opc = AArch64::FMLAv8i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed, &NewVR);
+ }
+ break;
+ }
+ case MachineCombinerPattern::FMLSv8f16_OP2:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLSv8f16;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ break;
+ case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ Opc = AArch64::FMLSv8i16_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
RC = &AArch64::FPR128RegClass;
@@ -4476,7 +4642,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
case MachineCombinerPattern::FMLSv2f32_OP1:
case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
RC = &AArch64::FPR64RegClass;
- unsigned NewVR = MRI.createVirtualRegister(RC);
+ Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
.add(Root.getOperand(2));
@@ -4496,7 +4662,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
case MachineCombinerPattern::FMLSv4f32_OP1:
case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
- unsigned NewVR = MRI.createVirtualRegister(RC);
+ Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
.add(Root.getOperand(2));
@@ -4516,7 +4682,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
case MachineCombinerPattern::FMLSv2f64_OP1:
case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
- unsigned NewVR = MRI.createVirtualRegister(RC);
+ Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
.add(Root.getOperand(2));
@@ -4617,15 +4783,15 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- unsigned VReg = MI.getOperand(0).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ Register VReg = MI.getOperand(0).getReg();
+ if (!Register::isVirtualRegister(VReg))
return false;
MachineInstr *DefMI = MRI->getVRegDef(VReg);
// Look through COPY instructions to find definition.
while (DefMI->isCopy()) {
- unsigned CopyVReg = DefMI->getOperand(1).getReg();
+ Register CopyVReg = DefMI->getOperand(1).getReg();
if (!MRI->hasOneNonDBGUse(CopyVReg))
return false;
if (!MRI->hasOneDef(CopyVReg))
@@ -4653,8 +4819,8 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
return false;
MachineOperand &MO = DefMI->getOperand(1);
- unsigned NewReg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(NewReg))
+ Register NewReg = MO.getReg();
+ if (!Register::isVirtualRegister(NewReg))
return false;
assert(!MRI->def_empty(NewReg) && "Register must be defined.");
@@ -4737,9 +4903,13 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_COFFSTUB, "aarch64-coffstub"},
- {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"},
- {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"},
- {MO_DLLIMPORT, "aarch64-dllimport"}};
+ {MO_GOT, "aarch64-got"},
+ {MO_NC, "aarch64-nc"},
+ {MO_S, "aarch64-s"},
+ {MO_TLS, "aarch64-tls"},
+ {MO_DLLIMPORT, "aarch64-dllimport"},
+ {MO_PREL, "aarch64-prel"},
+ {MO_TAGGED, "aarch64-tagged"}};
return makeArrayRef(TargetFlags);
}
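(Illustration only, not part of the patch: a minimal standalone C++ sketch of the
floating-point identities the FMULSUB/FNMULSUB combiner cases in the hunk above
implement, with std::fma standing in for the fused FNMSUB/FMSUB/FNMADD forms;
the function names are made up for this sketch.)

    #include <cmath>

    // fmul feeding operand 1 of fsub:   r = a*b - c     -> FNMSUB-style: fma(a, b, -c)
    double fnmsub_like(double a, double b, double c) { return std::fma(a, b, -c); }

    // fmul feeding operand 2 of fsub:   r = c - a*b     -> FMSUB-style:  fma(-a, b, c)
    double fmsub_like(double a, double b, double c) { return std::fma(-a, b, c); }

    // fnmul feeding operand 1 of fsub:  r = -(a*b) - c  -> FNMADD-style: fma(-a, b, -c)
    double fnmadd_like(double a, double b, double c) { return std::fma(-a, b, -c); }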
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 7be4daba7dc4..1688045e4fb8 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -15,6 +15,7 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64StackOffset.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -55,8 +56,7 @@ public:
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA = nullptr) const override;
+ const MachineInstr &MIb) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
@@ -299,7 +299,7 @@ private:
/// if necessary, to be replaced by the scavenger at the end of PEI.
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
- int Offset, const TargetInstrInfo *TII,
+ StackOffset Offset, const TargetInstrInfo *TII,
MachineInstr::MIFlag = MachineInstr::NoFlags,
bool SetNZCV = false, bool NeedsWinCFI = false,
bool *HasWinCFI = nullptr);
@@ -308,7 +308,7 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
/// FP. Return false if the offset could not be handled directly in MI, and
/// return the left-over portion by reference.
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ unsigned FrameReg, StackOffset &Offset,
const AArch64InstrInfo *TII);
/// Use to report the frame offset status in isAArch64FrameOffsetLegal.
@@ -332,10 +332,10 @@ enum AArch64FrameOffsetStatus {
/// If set, @p EmittableOffset contains the amount that can be set in @p MI
/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
/// is a legal offset.
-int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset,
bool *OutUseUnscaledOp = nullptr,
unsigned *OutUnscaledOp = nullptr,
- int *EmittableOffset = nullptr);
+ int64_t *EmittableOffset = nullptr);
static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index eed53f36d574..1981bd5d3bf0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -62,6 +62,9 @@ def HasAM : Predicate<"Subtarget->hasAM()">,
def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
AssemblerPredicate<"FeatureSEL2", "sel2">;
+def HasPMU : Predicate<"Subtarget->hasPMU()">,
+ AssemblerPredicate<"FeaturePMU", "pmu">;
+
def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">;
@@ -116,7 +119,7 @@ def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
- AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">;
+ AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicate<"FeatureRCPC", "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -133,6 +136,12 @@ def HasBTI : Predicate<"Subtarget->hasBTI()">,
AssemblerPredicate<"FeatureBranchTargetId", "bti">;
def HasMTE : Predicate<"Subtarget->hasMTE()">,
AssemblerPredicate<"FeatureMTE", "mte">;
+def HasTME : Predicate<"Subtarget->hasTME()">,
+ AssemblerPredicate<"FeatureTME", "tme">;
+def HasETE : Predicate<"Subtarget->hasETE()">,
+ AssemblerPredicate<"FeatureETE", "ete">;
+def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
+ AssemblerPredicate<"FeatureTRBE", "trbe">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
@@ -415,6 +424,14 @@ def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, S
def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def SDT_AArch64unpk : SDTypeProfile<1, 1, [
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>
+]>;
+def AArch64sunpkhi : SDNode<"AArch64ISD::SUNPKHI", SDT_AArch64unpk>;
+def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>;
+def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
+def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
+
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -431,6 +448,13 @@ let RecomputePerFunction = 1 in {
def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
+
+ // Toggles patterns which aren't beneficial in GlobalISel when we aren't
+ // optimizing. This allows us to selectively use patterns without impacting
+ // SelectionDAG's behaviour.
+ // FIXME: One day there will probably be a nicer way to check for this, but
+ // today is not that day.
+ def OptimizedGISelOrOtherSelector : Predicate<"!MF->getFunction().hasOptNone() || MF->getProperties().hasProperty(MachineFunctionProperties::Property::FailedISel) || !MF->getProperties().hasProperty(MachineFunctionProperties::Property::Legalized)">;
}
include "AArch64InstrFormats.td"
@@ -785,7 +809,11 @@ def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in {
def HWASAN_CHECK_MEMACCESS : Pseudo<
(outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
- [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>,
+ [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
+ Sched<[]>;
+def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo<
+ (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
+ [(int_hwasan_check_memaccess_shortgranules X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>,
Sched<[]>;
}
@@ -804,6 +832,23 @@ def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
(SYSxt imm0_7:$op1, sys_cr_op:$Cn,
sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
+
+let Predicates = [HasTME] in {
+
+def TSTART : TMSystemI<0b0000, "tstart",
+ [(set GPR64:$Rt, (int_aarch64_tstart))]>;
+
+def TCOMMIT : TMSystemINoOperand<0b0000, "tcommit", [(int_aarch64_tcommit)]>;
+
+def TCANCEL : TMSystemException<0b011, "tcancel",
+ [(int_aarch64_tcancel i64_imm0_65535:$imm)]>;
+
+def TTEST : TMSystemI<0b0001, "ttest", [(set GPR64:$Rt, (int_aarch64_ttest))]> {
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+} // HasTME
+
//===----------------------------------------------------------------------===//
// Move immediate instructions.
//===----------------------------------------------------------------------===//
@@ -815,37 +860,37 @@ let PostEncoderMethod = "fixMOVZ" in
defm MOVZ : MoveImmediate<0b10, "movz">;
// First group of aliases covers an implicit "lsl #0".
-def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>;
-def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>;
-def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, i32_imm0_65535:$imm, 0), 0>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, i32_imm0_65535:$imm, 0), 0>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, i32_imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, i32_imm0_65535:$imm, 0)>;
// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g3:$sym, 48), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g2:$sym, 32), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g1:$sym, 16), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g0:$sym, 0), 0>;
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g1:$sym, 16), 0>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g0:$sym, 0), 0>;
// Final group of aliases covers true "mov $Rd, $imm" cases.
multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
@@ -917,8 +962,12 @@ def trunc_imm : SDNodeXForm<imm, [{
def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">,
GISDNodeXFormEquiv<trunc_imm>;
+let Predicates = [OptimizedGISelOrOtherSelector] in {
+// The SUBREG_TO_REG isn't eliminated at -O0, which can result in pointless
+// copies.
def : Pat<(i64 i64imm_32bit:$src),
(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
+}
// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
@@ -1012,10 +1061,10 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
(SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
let AddedComplexity = 1 in {
-def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
- (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
-def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
- (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3),
+ (SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>;
+def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3),
+ (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>;
}
// Because of the immediate format for add/sub-imm instructions, the
@@ -2165,8 +2214,8 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{
if (auto *G = dyn_cast<GlobalAddressSDNode>(N)) {
const DataLayout &DL = MF->getDataLayout();
- unsigned Align = G->getGlobal()->getPointerAlignment(DL);
- return Align >= 4 && G->getOffset() % 4 == 0;
+ MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL);
+ return Align && *Align >= 4 && G->getOffset() % 4 == 0;
}
if (auto *C = dyn_cast<ConstantPoolSDNode>(N))
return C->getAlignment() >= 4 && C->getOffset() % 4 == 0;
@@ -3281,20 +3330,37 @@ defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike
// the NEON variant.
+
+// Here we handle first -(a + b*c) for FNMADD:
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, FPR16:$Ra)),
+ (FMSUBHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
(FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
(FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
-// "(-a) + b*(-c)".
+// Now it's time for "(-a) + (-b)*c"
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, (fneg FPR16:$Ra))),
+ (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+// And here "(-a) + b*(-c)"
+
+let Predicates = [HasNEON, HasFullFP16] in
+def : Pat<(f16 (fma FPR16:$Rn, (fneg FPR16:$Rm), (fneg FPR16:$Ra))),
+ (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>;
+
def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
@@ -6939,5 +7005,124 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>;
def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>;
+// Extracting lane zero is a special case where we can just use a plain
+// EXTRACT_SUBREG instruction, which will become FMOV. This is easier for the
+// rest of the compiler, especially the register allocator and copy propagation,
+// to reason about, so it is preferred whenever possible.
+let AddedComplexity = 10 in {
+ def : Pat<(i64 (extractelt (v2i64 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, dsub)>;
+ def : Pat<(i32 (extractelt (v4i32 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, ssub)>;
+ def : Pat<(i32 (extractelt (v2i32 V64:$V), (i64 0))), (EXTRACT_SUBREG V64:$V, ssub)>;
+}
+
+// dot_v4i8
+class mul_v4i8<SDPatternOperator ldop> :
+ PatFrag<(ops node:$Rn, node:$Rm, node:$offset),
+ (mul (ldop (add node:$Rn, node:$offset)),
+ (ldop (add node:$Rm, node:$offset)))>;
+class mulz_v4i8<SDPatternOperator ldop> :
+ PatFrag<(ops node:$Rn, node:$Rm),
+ (mul (ldop node:$Rn), (ldop node:$Rm))>;
+
+def load_v4i8 :
+ OutPatFrag<(ops node:$R),
+ (INSERT_SUBREG
+ (v2i32 (IMPLICIT_DEF)),
+ (i32 (COPY_TO_REGCLASS (LDRWui node:$R, (i64 0)), FPR32)),
+ ssub)>;
+
+class dot_v4i8<Instruction DOT, SDPatternOperator ldop> :
+ Pat<(i32 (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 3)),
+ (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 2)),
+ (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 1)),
+ (mulz_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm))))),
+ (EXTRACT_SUBREG (i64 (DOT (DUPv2i32gpr WZR),
+ (load_v4i8 GPR64sp:$Rn),
+ (load_v4i8 GPR64sp:$Rm))),
+ sub_32)>, Requires<[HasDotProd]>;
+
+// dot_v8i8
+class ee_v8i8<SDPatternOperator extend> :
+ PatFrag<(ops node:$V, node:$K),
+ (v4i16 (extract_subvector (v8i16 (extend node:$V)), node:$K))>;
+
+class mul_v8i8<SDPatternOperator mulop, SDPatternOperator extend> :
+ PatFrag<(ops node:$M, node:$N, node:$K),
+ (mulop (v4i16 (ee_v8i8<extend> node:$M, node:$K)),
+ (v4i16 (ee_v8i8<extend> node:$N, node:$K)))>;
+
+class idot_v8i8<SDPatternOperator mulop, SDPatternOperator extend> :
+ PatFrag<(ops node:$M, node:$N),
+ (i32 (extractelt
+ (v4i32 (AArch64uaddv
+ (add (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 0)),
+ (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 4))))),
+ (i64 0)))>;
+
+// vaddv_[su]32 is special: it lowers to ADDP Vd.2S,Vn.2S,Vm.2S (with Vn==Vm) and returns Vd.s[0].
+def VADDV_32 : OutPatFrag<(ops node:$R), (ADDPv2i32 node:$R, node:$R)>;
+
+class odot_v8i8<Instruction DOT> :
+ OutPatFrag<(ops node:$Vm, node:$Vn),
+ (EXTRACT_SUBREG
+ (VADDV_32
+ (i64 (DOT (DUPv2i32gpr WZR),
+ (v8i8 node:$Vm),
+ (v8i8 node:$Vn)))),
+ sub_32)>;
+
+class dot_v8i8<Instruction DOT, SDPatternOperator mulop,
+ SDPatternOperator extend> :
+ Pat<(idot_v8i8<mulop, extend> V64:$Vm, V64:$Vn),
+ (odot_v8i8<DOT> V64:$Vm, V64:$Vn)>,
+ Requires<[HasDotProd]>;
+
+// dot_v16i8
+class ee_v16i8<SDPatternOperator extend> :
+ PatFrag<(ops node:$V, node:$K1, node:$K2),
+ (v4i16 (extract_subvector
+ (v8i16 (extend
+ (v8i8 (extract_subvector node:$V, node:$K1)))), node:$K2))>;
+
+class mul_v16i8<SDPatternOperator mulop, SDPatternOperator extend> :
+ PatFrag<(ops node:$M, node:$N, node:$K1, node:$K2),
+ (v4i32
+ (mulop (v4i16 (ee_v16i8<extend> node:$M, node:$K1, node:$K2)),
+ (v4i16 (ee_v16i8<extend> node:$N, node:$K1, node:$K2))))>;
+
+class idot_v16i8<SDPatternOperator m, SDPatternOperator x> :
+ PatFrag<(ops node:$M, node:$N),
+ (i32 (extractelt
+ (v4i32 (AArch64uaddv
+ (add
+ (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 0)),
+ (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 0))),
+ (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 4)),
+ (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 4)))))),
+ (i64 0)))>;
+
+class odot_v16i8<Instruction DOT> :
+ OutPatFrag<(ops node:$Vm, node:$Vn),
+ (i32 (ADDVv4i32v
+ (DOT (DUPv4i32gpr WZR), node:$Vm, node:$Vn)))>;
+
+class dot_v16i8<Instruction DOT, SDPatternOperator mulop,
+ SDPatternOperator extend> :
+ Pat<(idot_v16i8<mulop, extend> V128:$Vm, V128:$Vn),
+ (odot_v16i8<DOT> V128:$Vm, V128:$Vn)>,
+ Requires<[HasDotProd]>;
+
+let AddedComplexity = 10 in {
+ def : dot_v4i8<SDOTv8i8, sextloadi8>;
+ def : dot_v4i8<UDOTv8i8, zextloadi8>;
+ def : dot_v8i8<SDOTv8i8, AArch64smull, sext>;
+ def : dot_v8i8<UDOTv8i8, AArch64umull, zext>;
+ def : dot_v16i8<SDOTv16i8, AArch64smull, sext>;
+ def : dot_v16i8<UDOTv16i8, AArch64umull, zext>;
+
+ // FIXME: add patterns to generate vector by element dot product.
+ // FIXME: add SVE dot-product patterns.
+}
+
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 4e13fb8e2027..961f38cad1e4 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -51,9 +51,19 @@ public:
const AArch64Subtarget &STI,
const AArch64RegisterBankInfo &RBI);
- bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ bool select(MachineInstr &I) override;
static const char *getName() { return DEBUG_TYPE; }
+ void setupMF(MachineFunction &MF, GISelKnownBits &KB,
+ CodeGenCoverage &CoverageInfo) override {
+ InstructionSelector::setupMF(MF, KB, CoverageInfo);
+
+ // hasFnAttribute() is expensive to call on every BRCOND selection, so
+ // cache it here for each run of the selector.
+ ProduceNonFlagSettingCondBr =
+ !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
+ }
+
private:
/// tblgen-erated 'select' implementation, used as the initial selector for
/// the patterns that don't require complex C++.
@@ -68,6 +78,10 @@ private:
bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ /// Eliminate same-sized cross-bank copies into stores before selectImpl().
+ void contractCrossBankCopyIntoStore(MachineInstr &I,
+ MachineRegisterInfo &MRI) const;
+
bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
@@ -101,8 +115,6 @@ private:
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
- void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI,
- SmallVectorImpl<Optional<int>> &Idxs) const;
bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
@@ -116,6 +128,7 @@ private:
bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
@@ -128,6 +141,8 @@ private:
MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitTST(const Register &LHS, const Register &RHS,
@@ -155,7 +170,9 @@ private:
ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
+ ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
+ ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
unsigned Size) const;
@@ -183,11 +200,48 @@ private:
return selectAddrModeIndexed(Root, Width / 8);
}
+ bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
+ const MachineRegisterInfo &MRI) const;
+ ComplexRendererFns
+ selectAddrModeShiftedExtendXReg(MachineOperand &Root,
+ unsigned SizeInBytes) const;
+ ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
+ ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
+ unsigned SizeInBytes) const;
+ template <int Width>
+ ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
+ return selectAddrModeXRO(Root, Width / 8);
+ }
+
+ ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const;
+
+ ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
+ return selectShiftedRegister(Root);
+ }
+
+ ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
+ // TODO: selectShiftedRegister should allow for rotates on logical shifts.
+ // For now, make them the same. The only difference between the two is that
+ // logical shifts are allowed to fold in rotates. Otherwise, these are
+ // functionally the same.
+ return selectShiftedRegister(Root);
+ }
+
+ /// Instructions that accept extend modifiers like UXTW expect the register
+ /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a
+ /// subregister copy if necessary. Return either ExtReg, or the result of the
+ /// new copy.
+ Register narrowExtendRegIfNeeded(Register ExtReg,
+ MachineIRBuilder &MIB) const;
+ ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
+
void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
+ void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I) const;
+ void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I) const;
// Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
void materializeLargeCMVal(MachineInstr &I, const Value *V,
- unsigned char OpFlags) const;
+ unsigned OpFlags) const;
// Optimization methods.
bool tryOptVectorShuffle(MachineInstr &I) const;
@@ -197,12 +251,22 @@ private:
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
+ /// Return true if \p MI is a load or store of \p NumBytes bytes.
+ bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
+
+ /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
+ /// register zeroed out. In other words, the result of MI has been explicitly
+ /// zero extended.
+ bool isDef32(const MachineInstr &MI) const;
+
const AArch64TargetMachine &TM;
const AArch64Subtarget &STI;
const AArch64InstrInfo &TII;
const AArch64RegisterInfo &TRI;
const AArch64RegisterBankInfo &RBI;
+ bool ProduceNonFlagSettingCondBr = false;
+
#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL
@@ -312,7 +376,7 @@ static bool getSubRegForClass(const TargetRegisterClass *RC,
SubReg = AArch64::hsub;
break;
case 32:
- if (RC == &AArch64::GPR32RegClass)
+ if (RC != &AArch64::FPR32RegClass)
SubReg = AArch64::sub_32;
else
SubReg = AArch64::ssub;
@@ -357,7 +421,7 @@ static bool unsupportedBinOp(const MachineInstr &I,
// so, this will need to be taught about that, and we'll need to get the
// bank out of the minimal class for the register.
// Either way, this needs to be documented (and possibly verified).
- if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ if (!Register::isVirtualRegister(MO.getReg())) {
LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
return true;
}
@@ -492,8 +556,8 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
@@ -502,7 +566,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
(DstSize == SrcSize ||
// Copies are a mean to setup initial types, the number of
// bits may not exactly match.
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
+ (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
// Copies are a mean to copy bits around, as long as we are
// on the same register class, that's fine. Otherwise, that
// means we need some SUBREG_TO_REG or AND & co.
@@ -526,7 +590,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
/// SubRegCopy (To class) = COPY CopyReg:SubReg
/// Dst = COPY SubRegCopy
static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
- const RegisterBankInfo &RBI, unsigned SrcReg,
+ const RegisterBankInfo &RBI, Register SrcReg,
const TargetRegisterClass *From,
const TargetRegisterClass *To,
unsigned SubReg) {
@@ -539,7 +603,7 @@ static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
// It's possible that the destination register won't be constrained. Make
// sure that happens.
- if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()))
+ if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
return true;
@@ -553,8 +617,8 @@ static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned SrcReg = I.getOperand(1).getReg();
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
@@ -579,8 +643,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned SrcReg = I.getOperand(1).getReg();
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
@@ -607,11 +671,10 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
// result.
auto CheckCopy = [&]() {
// If we have a bitcast or something, we can't have physical registers.
- assert(
- (I.isCopy() ||
- (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()) &&
- !TargetRegisterInfo::isPhysicalRegister(I.getOperand(1).getReg()))) &&
- "No phys reg on generic operator!");
+ assert((I.isCopy() ||
+ (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
+ !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
+ "No phys reg on generic operator!");
assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI));
(void)KnownValid;
return true;
@@ -626,38 +689,38 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return false;
}
- // Is this a cross-bank copy?
- if (DstRegBank.getID() != SrcRegBank.getID()) {
- // If we're doing a cross-bank copy on different-sized registers, we need
- // to do a bit more work.
- unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
- unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
-
- if (SrcSize > DstSize) {
- // We're doing a cross-bank copy into a smaller register. We need a
- // subregister copy. First, get a register class that's on the same bank
- // as the destination, but the same size as the source.
- const TargetRegisterClass *SubregRC =
- getMinClassForRegBank(DstRegBank, SrcSize, true);
- assert(SubregRC && "Didn't get a register class for subreg?");
-
- // Get the appropriate subregister for the destination.
- unsigned SubReg = 0;
- if (!getSubRegForClass(DstRC, TRI, SubReg)) {
- LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n");
- return false;
- }
-
- // Now, insert a subregister copy using the new register class.
- selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg);
- return CheckCopy();
+ unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
+ unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
+
+ // If we're doing a cross-bank copy on different-sized registers, we need
+ // to do a bit more work.
+ if (SrcSize > DstSize) {
+ // We're doing a cross-bank copy into a smaller register. We need a
+ // subregister copy. First, get a register class that's on the same bank
+ // as the destination, but the same size as the source.
+ const TargetRegisterClass *SubregRC =
+ getMinClassForRegBank(DstRegBank, SrcSize, true);
+ assert(SubregRC && "Didn't get a register class for subreg?");
+
+ // Get the appropriate subregister for the destination.
+ unsigned SubReg = 0;
+ if (!getSubRegForClass(DstRC, TRI, SubReg)) {
+ LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n");
+ return false;
}
- else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 &&
- SrcSize == 16) {
+ // Now, insert a subregister copy using the new register class.
+ selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg);
+ return CheckCopy();
+ }
+
+ // Is this a cross-bank copy?
+ if (DstRegBank.getID() != SrcRegBank.getID()) {
+ if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 &&
+ SrcSize == 16) {
// Special case for FPR16 to GPR32.
// FIXME: This can probably be generalized like the above case.
- unsigned PromoteReg =
+ Register PromoteReg =
MRI.createVirtualRegister(&AArch64::FPR32RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
@@ -674,7 +737,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
// If the destination is a physical register, then there's nothing to
// change, so we're done.
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ if (Register::isPhysicalRegister(DstReg))
return CheckCopy();
}
@@ -955,7 +1018,9 @@ bool AArch64InstructionSelector::selectVectorSHL(
return false;
unsigned Opc = 0;
- if (Ty == LLT::vector(4, 32)) {
+ if (Ty == LLT::vector(2, 64)) {
+ Opc = AArch64::USHLv2i64;
+ } else if (Ty == LLT::vector(4, 32)) {
Opc = AArch64::USHLv4i32;
} else if (Ty == LLT::vector(2, 32)) {
Opc = AArch64::USHLv2i32;
@@ -989,7 +1054,11 @@ bool AArch64InstructionSelector::selectVectorASHR(
unsigned Opc = 0;
unsigned NegOpc = 0;
const TargetRegisterClass *RC = nullptr;
- if (Ty == LLT::vector(4, 32)) {
+ if (Ty == LLT::vector(2, 64)) {
+ Opc = AArch64::SSHLv2i64;
+ NegOpc = AArch64::NEGv2i64;
+ RC = &AArch64::FPR128RegClass;
+ } else if (Ty == LLT::vector(4, 32)) {
Opc = AArch64::SSHLv4i32;
NegOpc = AArch64::NEGv4i32;
RC = &AArch64::FPR128RegClass;
@@ -1044,7 +1113,7 @@ bool AArch64InstructionSelector::selectVaStartDarwin(
}
void AArch64InstructionSelector::materializeLargeCMVal(
- MachineInstr &I, const Value *V, unsigned char OpFlags) const {
+ MachineInstr &I, const Value *V, unsigned OpFlags) const {
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1097,8 +1166,8 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
// some reason we receive input GMIR that has an s64 shift amount that's not
// a G_CONSTANT, insert a truncate so that we can still select the s32
// register-register variant.
- unsigned SrcReg = I.getOperand(1).getReg();
- unsigned ShiftReg = I.getOperand(2).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+ Register ShiftReg = I.getOperand(2).getReg();
const LLT ShiftTy = MRI.getType(ShiftReg);
const LLT SrcTy = MRI.getType(SrcReg);
if (SrcTy.isVector())
@@ -1118,6 +1187,9 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
}
return;
}
+ case TargetOpcode::G_STORE:
+ contractCrossBankCopyIntoStore(I, MRI);
+ return;
default:
return;
}
@@ -1158,6 +1230,48 @@ bool AArch64InstructionSelector::earlySelectSHL(
return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
}
+void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
+ // If we're storing a scalar, it doesn't matter what register bank that
+ // scalar is on. All that matters is the size.
+ //
+ // So, if we see something like this (with a 32-bit scalar as an example):
+ //
+ // %x:gpr(s32) = ... something ...
+ // %y:fpr(s32) = COPY %x:gpr(s32)
+ // G_STORE %y:fpr(s32)
+ //
+ // We can fix this up into something like this:
+ //
+ // G_STORE %x:gpr(s32)
+ //
+ // And then continue the selection process normally.
+ MachineInstr *Def = getDefIgnoringCopies(I.getOperand(0).getReg(), MRI);
+ if (!Def)
+ return;
+ Register DefDstReg = Def->getOperand(0).getReg();
+ LLT DefDstTy = MRI.getType(DefDstReg);
+ Register StoreSrcReg = I.getOperand(0).getReg();
+ LLT StoreSrcTy = MRI.getType(StoreSrcReg);
+
+ // If we get something strange like a physical register, then we shouldn't
+ // go any further.
+ if (!DefDstTy.isValid())
+ return;
+
+ // Are the source and dst types the same size?
+ if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
+ return;
+
+ if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
+ RBI.getRegBank(DefDstReg, MRI, TRI))
+ return;
+
+ // We have a cross-bank copy, which is entering a store. Let's fold it.
+ I.getOperand(0).setReg(DefDstReg);
+}
+
bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -1169,13 +1283,37 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
switch (I.getOpcode()) {
case TargetOpcode::G_SHL:
return earlySelectSHL(I, MRI);
+ case TargetOpcode::G_CONSTANT: {
+ bool IsZero = false;
+ if (I.getOperand(1).isCImm())
+ IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
+ else if (I.getOperand(1).isImm())
+ IsZero = I.getOperand(1).getImm() == 0;
+
+ if (!IsZero)
+ return false;
+
+ Register DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+ if (Ty != LLT::scalar(64) && Ty != LLT::scalar(32))
+ return false;
+
+ if (Ty == LLT::scalar(64)) {
+ I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
+ RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
+ } else {
+ I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
+ RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
+ }
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ return true;
+ }
default:
return false;
}
}
-bool AArch64InstructionSelector::select(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+bool AArch64InstructionSelector::select(MachineInstr &I) {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -1244,7 +1382,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (earlySelect(I))
return true;
- if (selectImpl(I, CoverageInfo))
+ if (selectImpl(I, *CoverageInfo))
return true;
LLT Ty =
@@ -1439,14 +1577,43 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return true;
}
case TargetOpcode::G_EXTRACT: {
- LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
- LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ LLT DstTy = MRI.getType(DstReg);
(void)DstTy;
unsigned SrcSize = SrcTy.getSizeInBits();
- // Larger extracts are vectors, same-size extracts should be something else
- // by now (either split up or simplified to a COPY).
- if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32)
- return false;
+
+ if (SrcTy.getSizeInBits() > 64) {
+ // This should be an extract of an s128, which is like a vector extract.
+ if (SrcTy.getSizeInBits() != 128)
+ return false;
+ // Only support extracting 64 bits from an s128 at the moment.
+ if (DstTy.getSizeInBits() != 64)
+ return false;
+
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ // Check we have the right regbank always.
+ assert(SrcRB.getID() == AArch64::FPRRegBankID &&
+ DstRB.getID() == AArch64::FPRRegBankID &&
+ "Wrong extract regbank!");
+ (void)SrcRB;
+
+ // Emit the same code as a vector extract.
+ // Offset must be a multiple of 64.
+ unsigned Offset = I.getOperand(2).getImm();
+ if (Offset % 64 != 0)
+ return false;
+ unsigned LaneIdx = Offset / 64;
+ MachineIRBuilder MIB(I);
+ MachineInstr *Extract = emitExtractVectorElt(
+ DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
+ if (!Extract)
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
@@ -1458,7 +1625,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- Register DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
.addReg(DstReg, 0, AArch64::sub_32);
@@ -1521,11 +1688,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_GLOBAL_VALUE: {
auto GV = I.getOperand(1).getGlobal();
- if (GV->isThreadLocal()) {
- // FIXME: we don't support TLS yet.
- return false;
- }
- unsigned char OpFlags = STI.ClassifyGlobalReference(GV, TM);
+ if (GV->isThreadLocal())
+ return selectTLSGlobalValue(I, MRI);
+
+ unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
if (OpFlags & AArch64II::MO_GOT) {
I.setDesc(TII.get(AArch64::LOADgot));
I.getOperand(1).setTargetFlags(OpFlags);
@@ -1562,8 +1728,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
}
auto &MemOp = **I.memoperands_begin();
- if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
- LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ if (MemOp.isAtomic()) {
+ // For now we just support s8 acquire loads to be able to compile stack
+ // protector code.
+ if (MemOp.getOrdering() == AtomicOrdering::Acquire &&
+ MemOp.getSize() == 1) {
+ I.setDesc(TII.get(AArch64::LDARB));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+ LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n");
return false;
}
unsigned MemSizeInBits = MemOp.getSize() * 8;
@@ -1598,7 +1771,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
const unsigned Size = MemSizeInBits / 8;
const unsigned Scale = Log2_32(Size);
if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
- unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
+ Register Ptr2Reg = PtrMI->getOperand(1).getReg();
I.getOperand(1).setReg(Ptr2Reg);
PtrMI = MRI.getVRegDef(Ptr2Reg);
Offset = Imm / Size;
@@ -1688,8 +1861,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return selectVectorSHL(I, MRI);
LLVM_FALLTHROUGH;
case TargetOpcode::G_OR:
- case TargetOpcode::G_LSHR:
- case TargetOpcode::G_GEP: {
+ case TargetOpcode::G_LSHR: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
@@ -1711,6 +1883,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+ case TargetOpcode::G_GEP: {
+ MachineIRBuilder MIRBuilder(I);
+ emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2),
+ MIRBuilder);
+ I.eraseFromParent();
+ return true;
+ }
case TargetOpcode::G_UADDO: {
// TODO: Support other types.
unsigned OpSize = Ty.getSizeInBits();
@@ -1816,6 +1995,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
+
+ if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
+ MachineIRBuilder MIB(I);
+ MachineInstr *Extract = emitExtractVectorElt(
+ DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
+ if (!Extract)
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
}
return false;
@@ -1868,21 +2057,41 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_SEXT: {
unsigned Opcode = I.getOpcode();
- const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
- SrcTy = MRI.getType(I.getOperand(1).getReg());
- const bool isSigned = Opcode == TargetOpcode::G_SEXT;
+ const bool IsSigned = Opcode == TargetOpcode::G_SEXT;
const Register DefReg = I.getOperand(0).getReg();
const Register SrcReg = I.getOperand(1).getReg();
- const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+ const LLT DstTy = MRI.getType(DefReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned DstSize = DstTy.getSizeInBits();
+ unsigned SrcSize = SrcTy.getSizeInBits();
- if (RB.getID() != AArch64::GPRRegBankID) {
- LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB
- << ", expected: GPR\n");
- return false;
- }
+ assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
+ AArch64::GPRRegBankID &&
+ "Unexpected ext regbank");
+ MachineIRBuilder MIB(I);
MachineInstr *ExtI;
- if (DstTy == LLT::scalar(64)) {
+ if (DstTy.isVector())
+ return false; // Should be handled by imported patterns.
+
+  // If we're zero-extending the result of a load whose destination type is
+  // smaller than 32 bits, this zext is redundant: GPR32 is the smallest GPR
+  // register on AArch64, and all smaller loads automatically zero-extend the
+  // upper bits. E.g.
+  //  %v(s8) = G_LOAD %p :: (load 1)
+ // %v2(s32) = G_ZEXT %v(s8)
+ if (!IsSigned) {
+ auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
+ if (LoadMI &&
+ RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) {
+ const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
+ unsigned BytesLoaded = MemOp->getSize();
+ if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
+ return selectCopy(I, TII, MRI, TRI, RBI);
+ }
+ }
+
+ if (DstSize == 64) {
// FIXME: Can we avoid manually doing this?
if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
@@ -1890,33 +2099,26 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return false;
}
- const Register SrcXReg =
- MRI.createVirtualRegister(&AArch64::GPR64RegClass);
- BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
- .addDef(SrcXReg)
- .addImm(0)
- .addUse(SrcReg)
- .addImm(AArch64::sub_32);
-
- const unsigned NewOpc = isSigned ? AArch64::SBFMXri : AArch64::UBFMXri;
- ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
- .addDef(DefReg)
- .addUse(SrcXReg)
- .addImm(0)
- .addImm(SrcTy.getSizeInBits() - 1);
- } else if (DstTy.isScalar() && DstTy.getSizeInBits() <= 32) {
- const unsigned NewOpc = isSigned ? AArch64::SBFMWri : AArch64::UBFMWri;
- ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
- .addDef(DefReg)
- .addUse(SrcReg)
- .addImm(0)
- .addImm(SrcTy.getSizeInBits() - 1);
+ auto SubregToReg =
+ MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {})
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(AArch64::sub_32);
+
+ ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
+ {DefReg}, {SubregToReg})
+ .addImm(0)
+ .addImm(SrcSize - 1);
+ } else if (DstSize <= 32) {
+ ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
+ {DefReg}, {SrcReg})
+ .addImm(0)
+ .addImm(SrcSize - 1);
} else {
return false;
}
constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
-
I.eraseFromParent();
return true;
}
@@ -2163,6 +2365,37 @@ bool AArch64InstructionSelector::selectJumpTable(
return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
}
+bool AArch64InstructionSelector::selectTLSGlobalValue(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ if (!STI.isTargetMachO())
+ return false;
+ MachineFunction &MF = *I.getParent()->getParent();
+ MF.getFrameInfo().setAdjustsStack(true);
+
+ const GlobalValue &GV = *I.getOperand(1).getGlobal();
+ MachineIRBuilder MIB(I);
+
+ MIB.buildInstr(AArch64::LOADgot, {AArch64::X0}, {})
+ .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
+
+ auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
+ {Register(AArch64::X0)})
+ .addImm(0);
+
+ // TLS calls preserve all registers except those that absolutely must be
+ // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+ // silly).
+ MIB.buildInstr(AArch64::BLR, {}, {Load})
+ .addDef(AArch64::X0, RegState::Implicit)
+ .addRegMask(TRI.getTLSCallPreservedMask());
+
+ MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
+ RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
+ MRI);
+ I.eraseFromParent();
+ return true;
+}
+
bool AArch64InstructionSelector::selectIntrinsicTrunc(
MachineInstr &I, MachineRegisterInfo &MRI) const {
const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
@@ -2478,16 +2711,40 @@ bool AArch64InstructionSelector::selectMergeValues(
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
+ const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
- // At the moment we only support merging two s32s into an s64.
if (I.getNumOperands() != 3)
return false;
- if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
- return false;
- const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
+
+ // Merging 2 s64s into an s128.
+ if (DstTy == LLT::scalar(128)) {
+ if (SrcTy.getSizeInBits() != 64)
+ return false;
+ MachineIRBuilder MIB(I);
+ Register DstReg = I.getOperand(0).getReg();
+ Register Src1Reg = I.getOperand(1).getReg();
+ Register Src2Reg = I.getOperand(2).getReg();
+ auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
+ MachineInstr *InsMI =
+ emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
+ if (!InsMI)
+ return false;
+ MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
+ Src2Reg, /* LaneIdx */ 1, RB, MIB);
+ if (!Ins2MI)
+ return false;
+ constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+ }
+
if (RB.getID() != AArch64::GPRRegBankID)
return false;
+ if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
+ return false;
+
auto *DstRC = &AArch64::GPR64RegClass;
Register SubToRegDef = MRI.createVirtualRegister(DstRC);
MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -2695,7 +2952,8 @@ bool AArch64InstructionSelector::selectUnmergeValues(
const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
const LLT WideTy = MRI.getType(SrcReg);
(void)WideTy;
- assert(WideTy.isVector() && "can only unmerge from vector types!");
+ assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
+ "can only unmerge from vector or s128 types!");
assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
"source register size too small!");
@@ -2802,29 +3060,6 @@ bool AArch64InstructionSelector::selectConcatVectors(
return true;
}
-void AArch64InstructionSelector::collectShuffleMaskIndices(
- MachineInstr &I, MachineRegisterInfo &MRI,
- SmallVectorImpl<Optional<int>> &Idxs) const {
- MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg());
- assert(
- MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
- "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR");
- // Find the constant indices.
- for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) {
- // Look through copies.
- MachineInstr *ScalarDef =
- getDefIgnoringCopies(MaskDef->getOperand(i).getReg(), MRI);
- assert(ScalarDef && "Could not find vreg def of shufflevec index op");
- if (ScalarDef->getOpcode() != TargetOpcode::G_CONSTANT) {
- // This be an undef if not a constant.
- assert(ScalarDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF);
- Idxs.push_back(None);
- } else {
- Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue());
- }
- }
-}
-
unsigned
AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal,
MachineFunction &MF) const {
@@ -2906,6 +3141,31 @@ getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
}
MachineInstr *
+AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
+ MachineOperand &RHS,
+ MachineIRBuilder &MIRBuilder) const {
+ assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+ MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+ static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri},
+ {AArch64::ADDWrr, AArch64::ADDWri}};
+ bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32;
+ auto ImmFns = selectArithImmed(RHS);
+ unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
+ auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()});
+
+ // If we matched a valid constant immediate, add those operands.
+ if (ImmFns) {
+ for (auto &RenderFn : *ImmFns)
+ RenderFn(AddMI);
+ } else {
+ AddMI.addUse(RHS.getReg());
+ }
+
+ constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI);
+ return &*AddMI;
+}
+
+MachineInstr *
AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const {
assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
@@ -3151,7 +3411,7 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
// Can't see past copies from physregs.
if (Opc == TargetOpcode::COPY &&
- TargetRegisterInfo::isPhysicalRegister(CondDef->getOperand(1).getReg()))
+ Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
return false;
CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
@@ -3342,16 +3602,9 @@ bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
return false;
// The shuffle's second operand doesn't matter if the mask is all zero.
- auto *ZeroVec = getOpcodeDef(G_BUILD_VECTOR, I.getOperand(3).getReg(), MRI);
- if (!ZeroVec)
+ const Constant *Mask = I.getOperand(3).getShuffleMask();
+ if (!isa<ConstantAggregateZero>(Mask))
return false;
- int64_t Zero = 0;
- if (!mi_match(ZeroVec->getOperand(1).getReg(), MRI, m_ICst(Zero)) || Zero)
- return false;
- for (unsigned i = 1, e = ZeroVec->getNumOperands() - 1; i < e; ++i) {
- if (ZeroVec->getOperand(i).getReg() != ZeroVec->getOperand(1).getReg())
- return false; // This wasn't an all zeros vector.
- }
// We're done, now find out what kind of splat we need.
LLT VecTy = MRI.getType(I.getOperand(0).getReg());
@@ -3399,19 +3652,14 @@ bool AArch64InstructionSelector::selectShuffleVector(
const LLT Src1Ty = MRI.getType(Src1Reg);
Register Src2Reg = I.getOperand(2).getReg();
const LLT Src2Ty = MRI.getType(Src2Reg);
+ const Constant *ShuffleMask = I.getOperand(3).getShuffleMask();
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
LLVMContext &Ctx = MF.getFunction().getContext();
- // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask
- // operand, it comes in as a normal vector value which we have to analyze to
- // find the mask indices. If the mask element is undef, then
- // collectShuffleMaskIndices() will add a None entry for that index into
- // the list.
- SmallVector<Optional<int>, 8> Mask;
- collectShuffleMaskIndices(I, MRI, Mask);
- assert(!Mask.empty() && "Expected to find mask indices");
+ SmallVector<int, 8> Mask;
+ ShuffleVectorInst::getShuffleMask(ShuffleMask, Mask);
// G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
// it's originated from a <1 x T> type. Those should have been lowered into
@@ -3424,10 +3672,10 @@ bool AArch64InstructionSelector::selectShuffleVector(
unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
SmallVector<Constant *, 64> CstIdxs;
- for (auto &MaybeVal : Mask) {
+ for (int Val : Mask) {
// For now, any undef indexes we'll just assume to be 0. This should be
// optimized in future, e.g. to select DUP etc.
- int Val = MaybeVal.hasValue() ? *MaybeVal : 0;
+ Val = Val < 0 ? 0 : Val;
for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
unsigned Offset = Byte + Val * BytesPerElt;
CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
@@ -3684,21 +3932,6 @@ static unsigned findIntrinsicID(MachineInstr &I) {
return IntrinOp->getIntrinsicID();
}
-/// Helper function to emit the correct opcode for a llvm.aarch64.stlxr
-/// intrinsic.
-static unsigned getStlxrOpcode(unsigned NumBytesToStore) {
- switch (NumBytesToStore) {
- // TODO: 1, 2, and 4 byte stores.
- case 8:
- return AArch64::STLXRX;
- default:
- LLVM_DEBUG(dbgs() << "Unexpected number of bytes to store! ("
- << NumBytesToStore << ")\n");
- break;
- }
- return 0;
-}
-
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
MachineInstr &I, MachineRegisterInfo &MRI) const {
// Find the intrinsic ID.
@@ -3719,32 +3952,6 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
return false;
MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
break;
- case Intrinsic::aarch64_stlxr:
- Register StatReg = I.getOperand(0).getReg();
- assert(RBI.getSizeInBits(StatReg, MRI, TRI) == 32 &&
- "Status register must be 32 bits!");
- Register SrcReg = I.getOperand(2).getReg();
-
- if (RBI.getSizeInBits(SrcReg, MRI, TRI) != 64) {
- LLVM_DEBUG(dbgs() << "Only support 64-bit sources right now.\n");
- return false;
- }
-
- Register PtrReg = I.getOperand(3).getReg();
- assert(MRI.getType(PtrReg).isPointer() && "Expected pointer operand");
-
- // Expect only one memory operand.
- if (!I.hasOneMemOperand())
- return false;
-
- const MachineMemOperand *MemOp = *I.memoperands_begin();
- unsigned NumBytesToStore = MemOp->getSize();
- unsigned Opc = getStlxrOpcode(NumBytesToStore);
- if (!Opc)
- return false;
-
- auto StoreMI = MIRBuilder.buildInstr(Opc, {StatReg}, {SrcReg, PtrReg});
- constrainSelectedInstRegOperands(*StoreMI, TII, TRI, RBI);
}
I.eraseFromParent();
@@ -3860,6 +4067,30 @@ AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
+/// Helper to select an immediate value that can be represented as a 12-bit
+/// value shifted left by either 0 or 12. If it is possible to do so, return
+/// the immediate and shift value. If not, return None.
+///
+/// Used by selectArithImmed and selectNegArithImmed.
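+///
+/// A sketch of the encoding this implements:
+///   0x000ABC -> imm = 0xABC, shift = LSL #0
+///   0xABC000 -> imm = 0xABC, shift = LSL #12
+///   0xABC001 -> None (neither form can represent it)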
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::select12BitValueWithLeftShift(
+ uint64_t Immed) const {
+ unsigned ShiftAmt;
+ if (Immed >> 12 == 0) {
+ ShiftAmt = 0;
+ } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+ ShiftAmt = 12;
+ Immed = Immed >> 12;
+ } else
+ return None;
+
+ unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
+ }};
+}
+
/// SelectArithImmed - Select an immediate value that can be represented as
/// a 12-bit value shifted left by either 0 or 12. If so, return true with
/// Val set to the 12-bit value and Shift set to the shifter operand.
@@ -3873,22 +4104,229 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
auto MaybeImmed = getImmedFromMO(Root);
if (MaybeImmed == None)
return None;
+ return select12BitValueWithLeftShift(*MaybeImmed);
+}
+
+/// SelectNegArithImmed - As above, but negates the value before trying to
+/// select it.
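+///
+/// For example (a sketch), a 32-bit compare against -16 negates the immediate
+/// to 16, which then encodes as imm = 16, LSL #0.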
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
+ // We need a register here, because we need to know if we have a 64 or 32
+ // bit immediate.
+ if (!Root.isReg())
+ return None;
+ auto MaybeImmed = getImmedFromMO(Root);
+ if (MaybeImmed == None)
+ return None;
uint64_t Immed = *MaybeImmed;
- unsigned ShiftAmt;
- if (Immed >> 12 == 0) {
- ShiftAmt = 0;
- } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
- ShiftAmt = 12;
- Immed = Immed >> 12;
- } else
+ // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
+ // have the opposite effect on the C flag, so this pattern mustn't match under
+ // those circumstances.
+ if (Immed == 0)
return None;
- unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
- }};
+  // Check whether the root is a 32-bit or a 64-bit type.
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+ if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
+ Immed = ~((uint32_t)Immed) + 1;
+ else
+ Immed = ~Immed + 1ULL;
+
+ if (Immed & 0xFFFFFFFFFF000000ULL)
+ return None;
+
+ Immed &= 0xFFFFFFULL;
+ return select12BitValueWithLeftShift(Immed);
+}
+
+/// Return true if it is worth folding MI into an extended register. That is,
+/// if it's safe to pull it into the addressing mode of a load or store as a
+/// shift.
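+///
+/// In short: always fold for a single use or at minsize; otherwise only fold
+/// on subtargets with LSLFast, and only when every use is a load or store.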
+bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
+ MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+ // Always fold if there is one use, or if we're optimizing for size.
+ Register DefReg = MI.getOperand(0).getReg();
+ if (MRI.hasOneUse(DefReg) ||
+ MI.getParent()->getParent()->getFunction().hasMinSize())
+ return true;
+
+ // It's better to avoid folding and recomputing shifts when we don't have a
+ // fastpath.
+ if (!STI.hasLSLFast())
+ return false;
+
+ // We have a fastpath, so folding a shift in and potentially computing it
+ // many times may be beneficial. Check if this is only used in memory ops.
+ // If it is, then we should fold.
+ return all_of(MRI.use_instructions(DefReg),
+ [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
+}
+
+/// This is used for computing addresses like this:
+///
+/// ldr x1, [x2, x3, lsl #3]
+///
+/// Where x2 is the base register, and x3 is an offset register. The shift-left
+/// is a constant value specific to this load instruction. That is, we'll never
+/// see anything other than a 3 here (which corresponds to the size of the
+/// element being loaded).
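+///
+/// For example, an 8-byte load has SizeInBytes = 8, so LegalShiftVal = 3, and
+/// only a G_SHL by 3 (or a G_MUL by 8) can be folded into the address.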
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
+ MachineOperand &Root, unsigned SizeInBytes) const {
+ if (!Root.isReg())
+ return None;
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ // Make sure that the memory op is a valid size.
+ int64_t LegalShiftVal = Log2_32(SizeInBytes);
+ if (LegalShiftVal == 0)
+ return None;
+
+ // We want to find something like this:
+ //
+ // val = G_CONSTANT LegalShiftVal
+ // shift = G_SHL off_reg val
+ // ptr = G_GEP base_reg shift
+ // x = G_LOAD ptr
+ //
+ // And fold it into this addressing mode:
+ //
+ // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
+
+ // Check if we can find the G_GEP.
+ MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI);
+ if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI))
+ return None;
+
+ // Now, try to match an opcode which will match our specific offset.
+ // We want a G_SHL or a G_MUL.
+ MachineInstr *OffsetInst = getDefIgnoringCopies(Gep->getOperand(2).getReg(), MRI);
+ if (!OffsetInst)
+ return None;
+
+ unsigned OffsetOpc = OffsetInst->getOpcode();
+ if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
+ return None;
+
+ if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
+ return None;
+
+ // Now, try to find the specific G_CONSTANT. Start by assuming that the
+ // register we will offset is the LHS, and the register containing the
+ // constant is the RHS.
+ Register OffsetReg = OffsetInst->getOperand(1).getReg();
+ Register ConstantReg = OffsetInst->getOperand(2).getReg();
+ auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ if (!ValAndVReg) {
+ // We didn't get a constant on the RHS. If the opcode is a shift, then
+ // we're done.
+ if (OffsetOpc == TargetOpcode::G_SHL)
+ return None;
+
+ // If we have a G_MUL, we can use either register. Try looking at the RHS.
+ std::swap(OffsetReg, ConstantReg);
+ ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ if (!ValAndVReg)
+ return None;
+ }
+
+ // The value must fit into 3 bits, and must be positive. Make sure that is
+ // true.
+ int64_t ImmVal = ValAndVReg->Value;
+
+ // Since we're going to pull this into a shift, the constant value must be
+ // a power of 2. If we got a multiply, then we need to check this.
+ if (OffsetOpc == TargetOpcode::G_MUL) {
+ if (!isPowerOf2_32(ImmVal))
+ return None;
+
+ // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
+ ImmVal = Log2_32(ImmVal);
+ }
+
+ if ((ImmVal & 0x7) != ImmVal)
+ return None;
+
+ // We are only allowed to shift by LegalShiftVal. This shift value is built
+ // into the instruction, so we can't just use whatever we want.
+ if (ImmVal != LegalShiftVal)
+ return None;
+
+ // We can use the LHS of the GEP as the base, and the LHS of the shift as an
+ // offset. Signify that we are shifting by setting the shift flag to 1.
+ return {{[=](MachineInstrBuilder &MIB) {
+ MIB.addUse(Gep->getOperand(1).getReg());
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
+ [=](MachineInstrBuilder &MIB) {
+ // Need to add both immediates here to make sure that they are both
+ // added to the instruction.
+ MIB.addImm(0);
+ MIB.addImm(1);
+ }}};
+}
+
+/// This is used for computing addresses like this:
+///
+/// ldr x1, [x2, x3]
+///
+/// Where x2 is the base register, and x3 is an offset register.
+///
+/// When possible (or profitable) to fold a G_GEP into the address calculation,
+/// this will do so. Otherwise, it will return None.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeRegisterOffset(
+ MachineOperand &Root) const {
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ // We need a GEP.
+ MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
+ if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
+ return None;
+
+ // If this is used more than once, let's not bother folding.
+ // TODO: Check if they are memory ops. If they are, then we can still fold
+ // without having to recompute anything.
+ if (!MRI.hasOneUse(Gep->getOperand(0).getReg()))
+ return None;
+
+ // Base is the GEP's LHS, offset is its RHS.
+ return {{[=](MachineInstrBuilder &MIB) {
+ MIB.addUse(Gep->getOperand(1).getReg());
+ },
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addUse(Gep->getOperand(2).getReg());
+ },
+ [=](MachineInstrBuilder &MIB) {
+ // Need to add both immediates here to make sure that they are both
+ // added to the instruction.
+ MIB.addImm(0);
+ MIB.addImm(0);
+ }}};
+}
+
+/// This is intended to be equivalent to selectAddrModeXRO in
+/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
+ unsigned SizeInBytes) const {
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ // If we have a constant offset, then we probably don't want to match a
+ // register offset.
+ if (isBaseWithConstantOffset(Root, MRI))
+ return None;
+
+ // Try to fold shifts into the addressing mode.
+ auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
+ if (AddrModeFns)
+ return AddrModeFns;
+
+ // If that doesn't work, see if it's possible to fold in registers from
+ // a GEP.
+ return selectAddrModeRegisterOffset(Root);
}
/// Select a "register plus unscaled signed 9-bit immediate" address. This
@@ -3994,6 +4432,205 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
}};
}
+/// Given a shift instruction, return the correct shift type for that
+/// instruction.
+static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
+ // TODO: Handle AArch64_AM::ROR
+ switch (MI.getOpcode()) {
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ case TargetOpcode::G_SHL:
+ return AArch64_AM::LSL;
+ case TargetOpcode::G_LSHR:
+ return AArch64_AM::LSR;
+ case TargetOpcode::G_ASHR:
+ return AArch64_AM::ASR;
+ }
+}
+
+/// Select a "shifted register" operand. If the value is not shifted, set the
+/// shift operand to a default value of "lsl 0".
+///
+/// TODO: Allow shifted register to be rotated in logical instructions.
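+///
+/// For example (a sketch), a G_SHL of a register by a constant 3 that feeds an
+/// add can be folded so the add selects as "add x0, x1, x2, lsl #3".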
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const {
+ if (!Root.isReg())
+ return None;
+ MachineRegisterInfo &MRI =
+ Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ // Check if the operand is defined by an instruction which corresponds to
+ // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
+ //
+ // TODO: Handle AArch64_AM::ROR for logical instructions.
+ MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
+ if (!ShiftInst)
+ return None;
+ AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
+ if (ShType == AArch64_AM::InvalidShiftExtend)
+ return None;
+ if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
+ return None;
+
+ // Need an immediate on the RHS.
+ MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
+ auto Immed = getImmedFromMO(ShiftRHS);
+ if (!Immed)
+ return None;
+
+ // We have something that we can fold. Fold in the shift's LHS and RHS into
+ // the instruction.
+ MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
+ Register ShiftReg = ShiftLHS.getReg();
+
+ unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
+ unsigned Val = *Immed & (NumBits - 1);
+ unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
+}
+
+/// Get the correct ShiftExtendType for an extend instruction.
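+/// For example, a G_ZEXT from s8 maps to UXTB, while a G_AND with a 0xFFFF
+/// mask on the RHS is treated as UXTH.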
+static AArch64_AM::ShiftExtendType
+getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ unsigned Opc = MI.getOpcode();
+
+ // Handle explicit extend instructions first.
+ if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
+ unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ assert(Size != 64 && "Extend from 64 bits?");
+ switch (Size) {
+ case 8:
+ return AArch64_AM::SXTB;
+ case 16:
+ return AArch64_AM::SXTH;
+ case 32:
+ return AArch64_AM::SXTW;
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ }
+ }
+
+ if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
+ unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ assert(Size != 64 && "Extend from 64 bits?");
+ switch (Size) {
+ case 8:
+ return AArch64_AM::UXTB;
+ case 16:
+ return AArch64_AM::UXTH;
+ case 32:
+ return AArch64_AM::UXTW;
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ }
+ }
+
+ // Don't have an explicit extend. Try to handle a G_AND with a constant mask
+ // on the RHS.
+ if (Opc != TargetOpcode::G_AND)
+ return AArch64_AM::InvalidShiftExtend;
+
+ Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
+ if (!MaybeAndMask)
+ return AArch64_AM::InvalidShiftExtend;
+ uint64_t AndMask = *MaybeAndMask;
+ switch (AndMask) {
+ default:
+ return AArch64_AM::InvalidShiftExtend;
+ case 0xFF:
+ return AArch64_AM::UXTB;
+ case 0xFFFF:
+ return AArch64_AM::UXTH;
+ case 0xFFFFFFFF:
+ return AArch64_AM::UXTW;
+ }
+}
+
+Register AArch64InstructionSelector::narrowExtendRegIfNeeded(
+ Register ExtReg, MachineIRBuilder &MIB) const {
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ if (MRI.getType(ExtReg).getSizeInBits() == 32)
+ return ExtReg;
+
+ // Insert a copy to move ExtReg to GPR32.
+ Register NarrowReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ auto Copy = MIB.buildCopy({NarrowReg}, {ExtReg});
+
+ // Select the copy into a subregister copy.
+ selectCopy(*Copy, TII, MRI, TRI, RBI);
+ return Copy.getReg(0);
+}
+
+/// Select an "extended register" operand. This operand folds in an extend
+/// followed by an optional left shift.
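+///
+/// For example (a sketch), a G_ZEXT from s32 shifted left by 2 and added to a
+/// base can be selected as "add x0, x1, w2, uxtw #2".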
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectArithExtendedRegister(
+ MachineOperand &Root) const {
+ if (!Root.isReg())
+ return None;
+ MachineRegisterInfo &MRI =
+ Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ uint64_t ShiftVal = 0;
+ Register ExtReg;
+ AArch64_AM::ShiftExtendType Ext;
+ MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
+ if (!RootDef)
+ return None;
+
+ if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
+ return None;
+
+ // Check if we can fold a shift and an extend.
+ if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
+ // Look for a constant on the RHS of the shift.
+ MachineOperand &RHS = RootDef->getOperand(2);
+ Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
+ if (!MaybeShiftVal)
+ return None;
+ ShiftVal = *MaybeShiftVal;
+ if (ShiftVal > 4)
+ return None;
+ // Look for a valid extend instruction on the LHS of the shift.
+ MachineOperand &LHS = RootDef->getOperand(1);
+ MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
+ if (!ExtDef)
+ return None;
+ Ext = getExtendTypeForInst(*ExtDef, MRI);
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return None;
+ ExtReg = ExtDef->getOperand(1).getReg();
+ } else {
+ // Didn't get a shift. Try just folding an extend.
+ Ext = getExtendTypeForInst(*RootDef, MRI);
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return None;
+ ExtReg = RootDef->getOperand(1).getReg();
+
+ // If we have a 32 bit instruction which zeroes out the high half of a
+ // register, we get an implicit zero extend for free. Check if we have one.
+ // FIXME: We actually emit the extend right now even though we don't have
+ // to.
+ if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
+ MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
+ if (ExtInst && isDef32(*ExtInst))
+ return None;
+ }
+ }
+
+ // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
+ // copy.
+ MachineIRBuilder MIB(*RootDef);
+ ExtReg = narrowExtendRegIfNeeded(ExtReg, MIB);
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addImm(getArithExtendImm(Ext, ShiftVal));
+ }}};
+}
+
void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -4003,6 +4640,51 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
MIB.addImm(CstVal.getValue());
}
+void AArch64InstructionSelector::renderLogicalImm32(
+ MachineInstrBuilder &MIB, const MachineInstr &I) const {
+ assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
+ uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
+ MIB.addImm(Enc);
+}
+
+void AArch64InstructionSelector::renderLogicalImm64(
+ MachineInstrBuilder &MIB, const MachineInstr &I) const {
+ assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
+ uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
+ MIB.addImm(Enc);
+}
+
+bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
+ const MachineInstr &MI, unsigned NumBytes) const {
+ if (!MI.mayLoadOrStore())
+ return false;
+ assert(MI.hasOneMemOperand() &&
+ "Expected load/store to have only one mem op!");
+ return (*MI.memoperands_begin())->getSize() == NumBytes;
+}
+
+bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
+ return false;
+
+ // Only return true if we know the operation will zero-out the high half of
+ // the 64-bit register. Truncates can be subregister copies, which don't
+ // zero out the high bits. Copies and other copy-like instructions can be
+ // fed by truncates, or could be lowered as subregister copies.
+ switch (MI.getOpcode()) {
+ default:
+ return true;
+ case TargetOpcode::COPY:
+ case TargetOpcode::G_BITCAST:
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_PHI:
+ return false;
+ }
+}
+
namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index a985b330eafa..7a1901bd5b1e 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -13,7 +13,9 @@
#include "AArch64LegalizerInfo.h"
#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
@@ -50,6 +52,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
const LLT v2s64 = LLT::vector(2, 64);
const LLT v2p0 = LLT::vector(2, p0);
+ // FIXME: support subtargets which have neon/fp-armv8 disabled.
+ if (!ST.hasNEON() || !ST.hasFPARMv8()) {
+ computeTables();
+ return;
+ }
+
getActionDefinitionsBuilder(G_IMPLICIT_DEF)
.legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64})
.clampScalar(0, s1, s64)
@@ -74,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder(G_BSWAP)
.legalFor({s32, s64, v4s32, v2s32, v2s64})
- .clampScalar(0, s16, s64)
+ .clampScalar(0, s32, s64)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
@@ -104,6 +112,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder({G_SDIV, G_UDIV})
.legalFor({s32, s64})
+ .libcallFor({s128})
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(0)
.scalarize(0);
@@ -115,8 +124,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
AmtTy.getSizeInBits() == 32;
})
- .legalFor(
- {{s32, s32}, {s32, s64}, {s64, s64}, {v2s32, v2s32}, {v4s32, v4s32}})
+ .legalFor({{s32, s32},
+ {s32, s64},
+ {s64, s64},
+ {v2s32, v2s32},
+ {v4s32, v4s32},
+ {v2s64, v2s64}})
.clampScalar(1, s32, s64)
.clampScalar(0, s32, s64)
.minScalarSameAs(1, 0);
@@ -191,14 +204,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.legalIf([=](const LegalityQuery &Query) {
const LLT &Ty0 = Query.Types[0];
const LLT &Ty1 = Query.Types[1];
- if (Ty1 != s32 && Ty1 != s64)
+ if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128)
return false;
if (Ty1 == p0)
return true;
return isPowerOf2_32(Ty0.getSizeInBits()) &&
(Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8);
})
- .clampScalar(1, s32, s64)
+ .clampScalar(1, s32, s128)
.widenScalarToNextPow2(1)
.maxScalarIf(typeInSet(1, {s32}), 0, s16)
.maxScalarIf(typeInSet(1, {s64}), 0, s32)
@@ -236,6 +249,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
{s32, p0, 32, 8},
{s64, p0, 64, 8},
{p0, p0, 64, 8},
+ {s128, p0, 128, 8},
{v8s8, p0, 64, 8},
{v16s8, p0, 128, 8},
{v4s16, p0, 64, 8},
@@ -247,14 +261,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.legalForTypesWithMemDesc({{s32, p0, 8, 8},
{s32, p0, 16, 8}})
.clampScalar(0, s8, s64)
- .widenScalarToNextPow2(0)
- // TODO: We could support sum-of-pow2's but the lowering code doesn't know
- // how to do that yet.
- .unsupportedIfMemSizeNotPow2()
+ .lowerIfMemSizeNotPow2()
// Lower any any-extending loads left into G_ANYEXT and G_LOAD
.lowerIf([=](const LegalityQuery &Query) {
return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
})
+ .widenScalarToNextPow2(0)
.clampMaxNumElements(0, s32, 2)
.clampMaxNumElements(0, s64, 1)
.customIf(IsPtrVecPred);
@@ -262,9 +274,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder(G_STORE)
.legalForTypesWithMemDesc({{s8, p0, 8, 8},
{s16, p0, 16, 8},
+ {s32, p0, 8, 8},
+ {s32, p0, 16, 8},
{s32, p0, 32, 8},
{s64, p0, 64, 8},
{p0, p0, 64, 8},
+ {s128, p0, 128, 8},
{v16s8, p0, 128, 8},
{v4s16, p0, 64, 8},
{v8s16, p0, 128, 8},
@@ -272,10 +287,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
{v4s32, p0, 128, 8},
{v2s64, p0, 128, 8}})
.clampScalar(0, s8, s64)
- .widenScalarToNextPow2(0)
- // TODO: We could support sum-of-pow2's but the lowering code doesn't know
- // how to do that yet.
- .unsupportedIfMemSizeNotPow2()
+ .lowerIfMemSizeNotPow2()
.lowerIf([=](const LegalityQuery &Query) {
return Query.Types[0].isScalar() &&
Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
@@ -305,8 +317,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
{v8s16, v8s16},
{v8s8, v8s8},
{v16s8, v16s8}})
- .clampScalar(0, s32, s32)
.clampScalar(1, s32, s64)
+ .clampScalar(0, s32, s32)
.minScalarEltSameAsIf(
[=](const LegalityQuery &Query) {
const LLT &Ty = Query.Types[0];
@@ -330,33 +342,40 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.widenScalarToNextPow2(1);
// Extensions
- getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
- .legalIf([=](const LegalityQuery &Query) {
- unsigned DstSize = Query.Types[0].getSizeInBits();
-
- // Make sure that we have something that will fit in a register, and
- // make sure it's a power of 2.
- if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
- return false;
+ auto ExtLegalFunc = [=](const LegalityQuery &Query) {
+ unsigned DstSize = Query.Types[0].getSizeInBits();
+
+ if (DstSize == 128 && !Query.Types[0].isVector())
+ return false; // Extending to a scalar s128 needs narrowing.
+
+ // Make sure that we have something that will fit in a register, and
+ // make sure it's a power of 2.
+ if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
+ return false;
- const LLT &SrcTy = Query.Types[1];
+ const LLT &SrcTy = Query.Types[1];
- // Special case for s1.
- if (SrcTy == s1)
- return true;
+ // Special case for s1.
+ if (SrcTy == s1)
+ return true;
- // Make sure we fit in a register otherwise. Don't bother checking that
- // the source type is below 128 bits. We shouldn't be allowing anything
- // through which is wider than the destination in the first place.
- unsigned SrcSize = SrcTy.getSizeInBits();
- if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
- return false;
+ // Make sure we fit in a register otherwise. Don't bother checking that
+ // the source type is below 128 bits. We shouldn't be allowing anything
+ // through which is wider than the destination in the first place.
+ unsigned SrcSize = SrcTy.getSizeInBits();
+ if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
+ return false;
- return true;
- });
+ return true;
+ };
+ getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
+ .legalIf(ExtLegalFunc)
+ .clampScalar(0, s64, s64); // Just for s128, others are handled above.
getActionDefinitionsBuilder(G_TRUNC).alwaysLegal();
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+
// FP conversions
getActionDefinitionsBuilder(G_FPTRUNC).legalFor(
{{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}});
@@ -591,6 +610,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
return Query.Types[0] == p0 && Query.Types[1] == s64;
});
+ getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
+
computeTables();
verify(*ST.getInstrInfo());
}
@@ -617,6 +638,24 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
llvm_unreachable("expected switch to return");
}
+bool AArch64LegalizerInfo::legalizeIntrinsic(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ if (createMemLibcall(MIRBuilder, MRI, MI) ==
+ LegalizerHelper::UnableToLegalize)
+ return false;
+ MI.eraseFromParent();
+ return true;
+ default:
+ break;
+ }
+ return true;
+}
+
bool AArch64LegalizerInfo::legalizeShlAshrLshr(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const {
@@ -655,7 +694,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
// legalized. In order to allow further legalization of the inst, we create
// a new instruction and erase the existing one.
- unsigned ValReg = MI.getOperand(0).getReg();
+ Register ValReg = MI.getOperand(0).getReg();
const LLT ValTy = MRI.getType(ValReg);
if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
@@ -672,7 +711,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg});
MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO);
} else {
- unsigned NewReg = MRI.createGenericVirtualRegister(NewTy);
+ Register NewReg = MRI.createGenericVirtualRegister(NewTy);
auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO);
MIRBuilder.buildBitcast({ValReg}, {NewLoad});
}
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h
index f3362a18620f..15161bab466c 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -31,6 +31,9 @@ public:
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const override;
+ bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
private:
bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const;
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 65b5f906e3f6..a0c4a25bb5b9 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -201,8 +201,22 @@ static bool isNarrowStore(unsigned Opc) {
}
}
+// These instructions set the memory tag and either keep the memory contents
+// unchanged or set them to zero, ignoring the address part of the source
+// register.
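+// For example, STG only updates the allocation tag of a 16-byte granule, while
+// STZG also zeroes that granule; ST2G and STZ2G do the same for two granules.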
+static bool isTagStore(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ return true;
+ }
+}
+
// Scaling factor for unscaled load or store.
-static int getMemScale(MachineInstr &MI) {
+static int getMemScale(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
llvm_unreachable("Opcode has unknown scale!");
@@ -255,6 +269,11 @@ static int getMemScale(MachineInstr &MI) {
case AArch64::STURQi:
case AArch64::LDPQi:
case AArch64::STPQi:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ case AArch64::STGPi:
return 16;
}
}
@@ -449,6 +468,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
return AArch64::STPWpre;
case AArch64::STPXi:
return AArch64::STPXpre;
+ case AArch64::STGOffset:
+ return AArch64::STGPreIndex;
+ case AArch64::STZGOffset:
+ return AArch64::STZGPreIndex;
+ case AArch64::ST2GOffset:
+ return AArch64::ST2GPreIndex;
+ case AArch64::STZ2GOffset:
+ return AArch64::STZ2GPreIndex;
+ case AArch64::STGPi:
+ return AArch64::STGPpre;
}
}
@@ -518,6 +547,16 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
return AArch64::STPWpost;
case AArch64::STPXi:
return AArch64::STPXpost;
+ case AArch64::STGOffset:
+ return AArch64::STGPostIndex;
+ case AArch64::STZGOffset:
+ return AArch64::STZGPostIndex;
+ case AArch64::ST2GOffset:
+ return AArch64::ST2GPostIndex;
+ case AArch64::STZ2GOffset:
+ return AArch64::STZ2GPostIndex;
+ case AArch64::STGPi:
+ return AArch64::STGPpost;
}
}
@@ -536,10 +575,30 @@ static bool isPairedLdSt(const MachineInstr &MI) {
case AArch64::STPQi:
case AArch64::STPWi:
case AArch64::STPXi:
+ case AArch64::STGPi:
return true;
}
}
+// Returns the scale and offset range of pre/post indexed variants of MI.
+static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
+ int &MinOffset, int &MaxOffset) {
+ bool IsPaired = isPairedLdSt(MI);
+ bool IsTagStore = isTagStore(MI);
+ // ST*G and all paired ldst have the same scale in pre/post-indexed variants
+ // as in the "unsigned offset" variant.
+ // All other pre/post indexed ldst instructions are unscaled.
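+  // For example, STGOffset scales by 16, so its pre/post-indexed forms cover
+  // byte offsets [-4096, 4080], while the paired STGPi covers [-1024, 1008].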
+ Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1;
+
+ if (IsPaired) {
+ MinOffset = -64;
+ MaxOffset = 63;
+ } else {
+ MinOffset = -256;
+ MaxOffset = 255;
+ }
+}
+
static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
unsigned PairedRegOp = 0) {
assert(PairedRegOp < 2 && "Unexpected register operand idx.");
@@ -618,6 +677,11 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
case AArch64::LDRWui:
case AArch64::LDRHHui:
case AArch64::LDRBBui:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ case AArch64::STGPi:
// Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
@@ -808,7 +872,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// STRWui %w1, ...
// USE kill %w1 ; need to clear kill flag when moving STRWui downwards
// STRW %w0
- unsigned Reg = getLdStRegOp(*I).getReg();
+ Register Reg = getLdStRegOp(*I).getReg();
for (MachineInstr &MI : make_range(std::next(I), Paired))
MI.clearRegisterKills(Reg, TRI);
}
@@ -837,9 +901,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineOperand &DstMO = MIB->getOperand(SExtIdx);
// Right now, DstMO has the extended register, since it comes from an
// extended opcode.
- unsigned DstRegX = DstMO.getReg();
+ Register DstRegX = DstMO.getReg();
// Get the W variant of that register.
- unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
+ Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
// Update the result of LDP to use the W instead of the X variant.
DstMO.setReg(DstRegW);
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
@@ -882,9 +946,9 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
int LoadSize = getMemScale(*LoadI);
int StoreSize = getMemScale(*StoreI);
- unsigned LdRt = getLdStRegOp(*LoadI).getReg();
+ Register LdRt = getLdStRegOp(*LoadI).getReg();
const MachineOperand &StMO = getLdStRegOp(*StoreI);
- unsigned StRt = getLdStRegOp(*StoreI).getReg();
+ Register StRt = getLdStRegOp(*StoreI).getReg();
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
assert((IsStoreXReg ||
@@ -933,10 +997,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
? getLdStOffsetOp(*StoreI).getImm()
: getLdStOffsetOp(*StoreI).getImm() * StoreSize;
int Width = LoadSize * 8;
- unsigned DestReg = IsStoreXReg
- ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
- &AArch64::GPR64RegClass)
- : LdRt;
+ unsigned DestReg =
+ IsStoreXReg ? Register(TRI->getMatchingSuperReg(
+ LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
+ : LdRt;
assert((UnscaledLdOffset >= UnscaledStOffset &&
(UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
@@ -1042,7 +1106,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator MBBI = I;
MachineInstr &LoadMI = *I;
- unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
+ Register BaseReg = getLdStBaseOp(LoadMI).getReg();
// If the load is the first instruction in the block, there's obviously
// not any matching store.
@@ -1156,8 +1220,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
bool MayLoad = FirstMI.mayLoad();
bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
- unsigned Reg = getLdStRegOp(FirstMI).getReg();
- unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+ Register Reg = getLdStRegOp(FirstMI).getReg();
+ Register BaseReg = getLdStBaseOp(FirstMI).getReg();
int Offset = getLdStOffsetOp(FirstMI).getImm();
int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
@@ -1188,7 +1252,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// check for +1/-1. Make sure to check the new instruction offset is
// actually an immediate and not a symbolic reference destined for
// a relocation.
- unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
+ Register MIBaseReg = getLdStBaseOp(MI).getReg();
int MIOffset = getLdStOffsetOp(MI).getImm();
bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
if (IsUnscaled != MIIsUnscaled) {
@@ -1328,18 +1392,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
: getPostIndexedOpcode(I->getOpcode());
MachineInstrBuilder MIB;
+ int Scale, MinOffset, MaxOffset;
+ getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
if (!isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
.add(getLdStRegOp(*Update))
.add(getLdStRegOp(*I))
.add(getLdStBaseOp(*I))
- .addImm(Value)
+ .addImm(Value / Scale)
.setMemRefs(I->memoperands())
.setMIFlags(I->mergeFlagsWith(*Update));
} else {
// Paired instruction.
- int Scale = getMemScale(*I);
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
.add(getLdStRegOp(*Update))
.add(getLdStRegOp(*I, 0))
@@ -1395,28 +1460,21 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
MI.getOperand(1).getReg() != BaseReg)
break;
- bool IsPairedInsn = isPairedLdSt(MemMI);
int UpdateOffset = MI.getOperand(2).getImm();
if (MI.getOpcode() == AArch64::SUBXri)
UpdateOffset = -UpdateOffset;
- // For non-paired load/store instructions, the immediate must fit in a
- // signed 9-bit integer.
- if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
+ // The immediate must be a multiple of the scaling factor of the pre/post
+ // indexed instruction.
+ int Scale, MinOffset, MaxOffset;
+ getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
+ if (UpdateOffset % Scale != 0)
break;
- // For paired load/store instructions, the immediate must be a multiple of
- // the scaling factor. The scaled offset must also fit into a signed 7-bit
- // integer.
- if (IsPairedInsn) {
- int Scale = getMemScale(MemMI);
- if (UpdateOffset % Scale != 0)
- break;
-
- int ScaledOffset = UpdateOffset / Scale;
- if (ScaledOffset > 63 || ScaledOffset < -64)
- break;
- }
+ // Scaled offset must fit in the instruction immediate.
+ int ScaledOffset = UpdateOffset / Scale;
+ if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
+ break;
// If we have a non-zero Offset, we check that it matches the amount
// we're adding to the register.
@@ -1433,7 +1491,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
- unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ Register BaseReg = getLdStBaseOp(MemMI).getReg();
int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
// Scan forward looking for post-index opportunities. Updating instructions
@@ -1442,13 +1500,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
if (MIUnscaledOffset != UnscaledOffset)
return E;
- // If the base register overlaps a destination register, we can't
- // merge the update.
- bool IsPairedInsn = isPairedLdSt(MemMI);
- for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
- unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
- return E;
+ // If the base register overlaps a source/destination register, we can't
+  // merge the update. This does not apply to tag store instructions, which
+  // ignore the address part of the source register.
+  // Nor does it apply to STGPi, which, unlike normal stores, has no
+  // unpredictable behavior in this case and always performs writeback after
+  // reading the source register value.
+ if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ Register DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
}
// Track which register units have been modified and used between the first
@@ -1487,7 +1551,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
- unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ Register BaseReg = getLdStBaseOp(MemMI).getReg();
int Offset = getLdStOffsetOp(MemMI).getImm();
// If the load/store is the first instruction in the block, there's obviously
@@ -1496,11 +1560,13 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
// If the base register overlaps a destination register, we can't
// merge the update.
- bool IsPairedInsn = isPairedLdSt(MemMI);
- for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
- unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
- return E;
+ if (!isTagStore(MemMI)) {
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ Register DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
}
// Track which register units have been modified and used between the first
@@ -1659,7 +1725,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
// however, is not, so adjust here.
int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
- // Look forward to try to find a post-index instruction. For example,
+ // Look forward to try to find a pre-index instruction. For example,
// ldr x1, [x0, #64]
// add x0, x0, #64
// merged into:
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index e7d4a2789a28..afd5ae6bcbf2 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -148,6 +148,8 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
RefFlags |= AArch64MCExpr::VK_TLSDESC;
break;
}
+ } else if (MO.getTargetFlags() & AArch64II::MO_PREL) {
+ RefFlags |= AArch64MCExpr::VK_PREL;
} else {
// No modifier means this is a generic reference, classified as absolute for
// the cases where it matters (:abs_g0: etc).
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 0efeeb272ec1..0009fb7b5520 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include <cassert>
@@ -95,6 +96,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// returned struct in a register. This field holds the virtual register into
/// which the sret argument is passed.
unsigned SRetReturnReg = 0;
+  /// The SVE stack size (for predicates and data vectors) is maintained here
+  /// rather than in FrameInfo, as the placement and Stack IDs are
+  /// target-specific.
+ uint64_t StackSizeSVE = 0;
+
+ /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid.
+ bool HasCalculatedStackSizeSVE = false;
/// Has a value when it is known whether or not the function uses a
/// redzone, and no value otherwise.
@@ -131,6 +139,15 @@ public:
ArgumentStackToRestore = bytes;
}
+ bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; }
+
+ void setStackSizeSVE(uint64_t S) {
+ HasCalculatedStackSizeSVE = true;
+ StackSizeSVE = S;
+ }
+
+ uint64_t getStackSizeSVE() const { return StackSizeSVE; }
+
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index aff861aae6be..d503c39b1f90 100644
--- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -162,11 +162,11 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
LiveIntervals &LIs = G.getMetadata().LIS;
- if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) {
- LLVM_DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
- << '\n');
- LLVM_DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
- << '\n');
+ if (Register::isPhysicalRegister(Rd) || Register::isPhysicalRegister(Ra)) {
+ LLVM_DEBUG(dbgs() << "Rd is a physical reg:"
+ << Register::isPhysicalRegister(Rd) << '\n');
+ LLVM_DEBUG(dbgs() << "Ra is a physical reg:"
+ << Register::isPhysicalRegister(Ra) << '\n');
return false;
}
@@ -359,8 +359,8 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) {
case AArch64::FMADDDrrr:
case AArch64::FNMSUBDrrr:
case AArch64::FNMADDDrrr: {
- unsigned Rd = MI.getOperand(0).getReg();
- unsigned Ra = MI.getOperand(3).getReg();
+ Register Rd = MI.getOperand(0).getReg();
+ Register Ra = MI.getOperand(3).getReg();
if (addIntraChainConstraint(G, Rd, Ra))
addInterChainConstraint(G, Rd, Ra);
@@ -369,7 +369,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) {
case AArch64::FMLAv2f32:
case AArch64::FMLSv2f32: {
- unsigned Rd = MI.getOperand(0).getReg();
+ Register Rd = MI.getOperand(0).getReg();
addInterChainConstraint(G, Rd, Rd);
break;
}
diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
index 5f7245bfbd74..d30ea120bae4 100644
--- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
+++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
@@ -15,7 +15,9 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
@@ -25,12 +27,31 @@
using namespace llvm;
using namespace MIPatternMatch;
+#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AArch64GenGICombiner.inc"
+#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
namespace {
+#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AArch64GenGICombiner.inc"
+#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
class AArch64PreLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
public:
- AArch64PreLegalizerCombinerInfo()
+ AArch64GenPreLegalizerCombinerHelper Generated;
+
+ AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
: CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
- /*LegalizerInfo*/ nullptr) {}
+ /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!Generated.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
MachineIRBuilder &B) const override;
};
@@ -38,24 +59,50 @@ public:
bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
- CombinerHelper Helper(Observer, B);
+ CombinerHelper Helper(Observer, B, KB, MDT);
switch (MI.getOpcode()) {
- default:
- return false;
- case TargetOpcode::COPY:
- return Helper.tryCombineCopy(MI);
- case TargetOpcode::G_BR:
- return Helper.tryCombineBr(MI);
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return Helper.tryCombineConcatVectors(MI);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return Helper.tryCombineShuffleVector(MI);
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
- case TargetOpcode::G_ZEXTLOAD:
- return Helper.tryCombineExtendingLoads(MI);
+ case TargetOpcode::G_ZEXTLOAD: {
+ bool Changed = false;
+ Changed |= Helper.tryCombineExtendingLoads(MI);
+ Changed |= Helper.tryCombineIndexedLoadStore(MI);
+ return Changed;
+ }
+ case TargetOpcode::G_STORE:
+ return Helper.tryCombineIndexedLoadStore(MI);
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset: {
+      // If we're at -O0, set a maxlen of 32 to inline; otherwise let the
+      // other heuristics decide.
+ unsigned MaxLen = EnableOpt ? 0 : 32;
+ // Try to inline memcpy type calls if optimizations are enabled.
+ return (!EnableMinSize) ? Helper.tryCombineMemCpyFamily(MI, MaxLen)
+ : false;
+ }
+ default:
+ break;
+ }
}
+ if (Generated.tryCombineAll(Observer, MI, B))
+ return true;
+
return false;
}
+#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AArch64GenGICombiner.inc"
+#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
// Pass boilerplate
// ================
@@ -63,24 +110,33 @@ class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
static char ID;
- AArch64PreLegalizerCombiner();
+ AArch64PreLegalizerCombiner(bool IsOptNone = false);
StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; }
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+ bool IsOptNone;
};
-}
+} // end anonymous namespace
void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
AU.setPreservesCFG();
getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
MachineFunctionPass::getAnalysisUsage(AU);
}
-AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner() : MachineFunctionPass(ID) {
+AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
@@ -89,7 +145,14 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
MachineFunctionProperties::Property::FailedISel))
return false;
auto *TPC = &getAnalysis<TargetPassConfig>();
- AArch64PreLegalizerCombinerInfo PCInfo;
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), KB, MDT);
Combiner C(PCInfo, TPC);
return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}
@@ -99,13 +162,14 @@ INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
"Combine AArch64 machine instrs before legalization",
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
"Combine AArch64 machine instrs before legalization", false,
false)
namespace llvm {
-FunctionPass *createAArch64PreLegalizeCombiner() {
- return new AArch64PreLegalizerCombiner();
+FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone) {
+ return new AArch64PreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index b52259cc9acd..8ec73aa3c040 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -563,12 +563,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getSameKindOfOperandsMapping(MI);
}
case TargetOpcode::COPY: {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
// Check if one of the register is not a generic register.
- if ((TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+ if ((Register::isPhysicalRegister(DstReg) ||
!MRI.getType(DstReg).isValid()) ||
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
+ (Register::isPhysicalRegister(SrcReg) ||
!MRI.getType(SrcReg).isValid())) {
const RegisterBank *DstRB = getRegBank(DstReg, MRI, TRI);
const RegisterBank *SrcRB = getRegBank(SrcReg, MRI, TRI);
@@ -635,6 +635,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Some of the floating-point instructions have mixed GPR and FPR operands:
// fine-tune the computed mapping.
switch (Opc) {
+ case TargetOpcode::G_TRUNC: {
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128)
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+ break;
+ }
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
if (MRI.getType(MI.getOperand(0).getReg()).isVector())
@@ -687,7 +693,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_STORE:
// Check if that store is fed by fp instructions.
if (OpRegBankIdx[0] == PMI_FirstGPR) {
- unsigned VReg = MI.getOperand(0).getReg();
+ Register VReg = MI.getOperand(0).getReg();
if (!VReg)
break;
MachineInstr *DefMI = MRI.getVRegDef(VReg);
@@ -702,11 +708,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
// If we're taking in vectors, we have no choice but to put everything on
- // FPRs.
+ // FPRs, except for the condition. The condition must always be on a GPR.
LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
if (SrcTy.isVector()) {
- for (unsigned Idx = 0; Idx < 4; ++Idx)
- OpRegBankIdx[Idx] = PMI_FirstFPR;
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR};
break;
}
@@ -740,7 +745,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// This doesn't check the condition, since it's just whatever is in NZCV.
// This isn't passed explicitly in a register to fcsel/csel.
for (unsigned Idx = 2; Idx < 4; ++Idx) {
- unsigned VReg = MI.getOperand(Idx).getReg();
+ Register VReg = MI.getOperand(Idx).getReg();
MachineInstr *DefMI = MRI.getVRegDef(VReg);
if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank ||
onlyDefinesFP(*DefMI, MRI, TRI))
@@ -750,8 +755,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// If we have more FP constraints than not, then move everything over to
// FPR.
if (NumFP >= 2)
- for (unsigned Idx = 0; Idx < 4; ++Idx)
- OpRegBankIdx[Idx] = PMI_FirstFPR;
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR};
break;
}
@@ -764,7 +768,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg());
// UNMERGE into scalars from a vector should always use FPR.
// Likewise if any of the uses are FP instructions.
- if (SrcTy.isVector() ||
+ if (SrcTy.isVector() || SrcTy == LLT::scalar(128) ||
any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
[&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) {
// Set the register bank of every operand to FPR.
@@ -795,12 +799,21 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Index needs to be a GPR.
OpRegBankIdx[3] = PMI_FirstGPR;
break;
+ case TargetOpcode::G_EXTRACT: {
+ // For s128 sources we have to use FPR.
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ if (SrcTy.getSizeInBits() == 128) {
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ OpRegBankIdx[1] = PMI_FirstFPR;
+ }
+ break;
+ }
case TargetOpcode::G_BUILD_VECTOR:
// If the first source operand belongs to a FPR register bank, then make
// sure that we preserve that.
if (OpRegBankIdx[1] != PMI_FirstGPR)
break;
- unsigned VReg = MI.getOperand(1).getReg();
+ Register VReg = MI.getOperand(1).getReg();
if (!VReg)
break;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 6d5a4e3d2f76..de176088595d 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -15,6 +15,7 @@
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
@@ -23,10 +24,10 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -63,8 +64,9 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_AArch64_AAPCS_SwiftError_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_AArch64_RT_MostRegs_SaveList;
- else
- return CSR_AArch64_AAPCS_SaveList;
+ if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin())
+ return CSR_Darwin_AArch64_AAPCS_SaveList;
+ return CSR_AArch64_AAPCS_SaveList;
}
const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
@@ -120,6 +122,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
: CSR_AArch64_CXX_TLS_Darwin_RegMask;
if (CC == CallingConv::AArch64_VectorCall)
return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
+ if (CC == CallingConv::AArch64_SVE_VectorCall)
+ return CSR_AArch64_SVE_AAPCS_RegMask;
if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
@@ -388,7 +392,7 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
int64_t Offset) const {
assert(Offset <= INT_MAX && "Offset too big to fit in int.");
assert(MI && "Unable to get the legal offset for nil instruction.");
- int SaveOffset = Offset;
+ StackOffset SaveOffset(Offset, MVT::i8);
return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
}
@@ -418,7 +422,9 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
int64_t Offset) const {
- int Off = Offset; // ARM doesn't need the general 64-bit offsets
+ // AArch64 doesn't need the general 64-bit offsets
+ StackOffset Off(Offset, MVT::i8);
+
unsigned i = 0;
while (!MI.getOperand(i).isFI()) {
@@ -441,40 +447,69 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64InstrInfo *TII =
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
const AArch64FrameLowering *TFI = getFrameLowering(MF);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ bool Tagged =
+ MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED;
unsigned FrameReg;
- int Offset;
// Special handling of dbg_value, stackmap and patchpoint instructions.
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
- Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
- /*PreferFP=*/true,
- /*ForSimm=*/false);
- Offset += MI.getOperand(FIOperandNum + 1).getImm();
+ StackOffset Offset =
+ TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+ /*PreferFP=*/true,
+ /*ForSimm=*/false);
+ Offset += StackOffset(MI.getOperand(FIOperandNum + 1).getImm(), MVT::i8);
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getBytes());
return;
}
if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) {
MachineOperand &FI = MI.getOperand(FIOperandNum);
- Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
+ int Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
FI.ChangeToImmediate(Offset);
return;
}
+ StackOffset Offset;
if (MI.getOpcode() == AArch64::TAGPstack) {
// TAGPstack must use the virtual frame register in its 3rd operand.
- const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
FrameReg = MI.getOperand(3).getReg();
- Offset =
- MFI.getObjectOffset(FrameIndex) + AFI->getTaggedBasePointerOffset();
+ Offset = {MFI.getObjectOffset(FrameIndex) +
+ AFI->getTaggedBasePointerOffset(),
+ MVT::i8};
+ } else if (Tagged) {
+ StackOffset SPOffset = {
+ MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8};
+ if (MFI.hasVarSizedObjects() ||
+ isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) !=
+ (AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) {
+ // Can't update to SP + offset in place. Precalculate the tagged pointer
+ // in a scratch register.
+ Offset = TFI->resolveFrameIndexReference(
+ MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
+ Register ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset,
+ TII);
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg)
+ .addReg(ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(0);
+ MI.getOperand(FIOperandNum)
+ .ChangeToRegister(ScratchReg, false, false, true);
+ return;
+ }
+ FrameReg = AArch64::SP;
+ Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(),
+ MVT::i8};
} else {
Offset = TFI->resolveFrameIndexReference(
MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
@@ -490,7 +525,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
- unsigned ScratchReg =
+ Register ScratchReg =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
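As a standalone illustration (not part of this patch): the tagged-frame-index path above folds "SP + offset" into the memory operand only when the offset is encodable in place, and otherwise materializes the tagged address with a scratch register and LDG. The sketch below approximates just the encodability side of that decision for the unsigned scaled-immediate load/store form (12-bit immediate scaled by the access size); the real check, isAArch64FrameOffsetLegal, also handles unscaled and other addressing forms, and the helper name fitsUnsignedScaledImm is illustrative only.

// Simplified, standalone approximation of the "can we fold SP+offset in
// place?" decision; the real legality check is broader than this.
#include <cstdint>
#include <cstdio>

static bool fitsUnsignedScaledImm(int64_t ByteOffset, unsigned AccessSize) {
  if (ByteOffset < 0 || ByteOffset % AccessSize != 0)
    return false;
  return (ByteOffset / AccessSize) <= 4095; // 12-bit scaled immediate
}

int main() {
  // A tagged slot at SP+32 accessed with an 8-byte load: fold in place.
  std::printf("%d\n", fitsUnsignedScaledImm(32, 8));    // 1
  // A slot 64 KiB away: out of range, take the scratch-register + LDG path.
  std::printf("%d\n", fitsUnsignedScaledImm(65536, 8)); // 0
  return 0;
}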
diff --git a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index 854670079e40..28a7e680849b 100644
--- a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -426,16 +426,16 @@ bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Get the operands of the current SIMD arithmetic instruction.
- unsigned MulDest = MI.getOperand(0).getReg();
- unsigned SrcReg0 = MI.getOperand(1).getReg();
+ Register MulDest = MI.getOperand(0).getReg();
+ Register SrcReg0 = MI.getOperand(1).getReg();
unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
- unsigned SrcReg1 = MI.getOperand(2).getReg();
+ Register SrcReg1 = MI.getOperand(2).getReg();
unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
unsigned DupDest;
// Instructions of interest have either 4 or 5 operands.
if (MI.getNumOperands() == 5) {
- unsigned SrcReg2 = MI.getOperand(3).getReg();
+ Register SrcReg2 = MI.getOperand(3).getReg();
unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
unsigned LaneNumber = MI.getOperand(4).getImm();
// Create a new DUP instruction. Note that if an equivalent DUP instruction
diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 79ab42f4c080..b573eac76754 100644
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -82,11 +82,11 @@ let Predicates = [HasSVE] in {
defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">;
defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">;
- defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">;
- defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">;
+ defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
+ defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;
- defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">;
- defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">;
+ defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
+ defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">;
defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">;
@@ -94,14 +94,14 @@ let Predicates = [HasSVE] in {
defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">;
defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">;
defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">;
- defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs">;
- defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg">;
-
- defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">;
- defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">;
- defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">;
- defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">;
- defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">;
+ defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>;
+ defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>;
+
+ defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", null_frag>;
+ defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", null_frag>;
+ defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>;
+ defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", null_frag>;
+ defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", null_frag>;
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">;
@@ -138,12 +138,12 @@ let Predicates = [HasSVE] in {
defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">;
defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">;
- defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">;
- defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">;
- defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">;
- defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">;
- defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">;
- defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">;
+ defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
+ defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", null_frag>;
+ defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", null_frag>;
+ defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>;
+ defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", null_frag>;
+ defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>;
defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;
@@ -187,7 +187,7 @@ let Predicates = [HasSVE] in {
defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;
// Splat scalar register (unpredicated, GPR or vector + element index)
- defm DUP_ZR : sve_int_perm_dup_r<"dup">;
+ defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>;
defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
// Splat scalar register (predicated)
@@ -211,13 +211,13 @@ let Predicates = [HasSVE] in {
defm REV_PP : sve_int_perm_reverse_p<"rev">;
defm REV_ZZ : sve_int_perm_reverse_z<"rev">;
- defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">;
- defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">;
- defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">;
- defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">;
+ defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
+ defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
+ defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>;
+ defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi", AArch64uunpkhi>;
- def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">;
- def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">;
+ defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>;
+ defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>;
defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
@@ -1020,6 +1020,56 @@ let Predicates = [HasSVE] in {
(FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+
+ def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+
+ def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+
+ def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+
+ def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+
+ def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+
+ def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+
+ def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+
}
let Predicates = [HasSVE2] in {
@@ -1164,6 +1214,13 @@ let Predicates = [HasSVE2] in {
defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">;
defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">;
+ // SVE2 predicated shifts
+ defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
+ defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
+ defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
+
// SVE2 integer add/subtract long
defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">;
defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">;
@@ -1199,14 +1256,14 @@ let Predicates = [HasSVE2] in {
defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">;
// SVE2 bitwise shift and insert
- defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">;
- defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">;
+ defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">;
+ defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">;
// SVE2 bitwise shift right and accumulate
- defm SSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">;
- defm USRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">;
- defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">;
- defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">;
+ defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">;
+ defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">;
+ defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">;
+ defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">;
// SVE2 complex integer add
defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">;
@@ -1228,41 +1285,47 @@ let Predicates = [HasSVE2] in {
defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">;
defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;
- // SVE2 bitwise shift right narrow
- defm SQSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">;
- defm SQSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">;
- defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">;
- defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">;
- defm SHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">;
- defm SHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">;
- defm RSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">;
- defm RSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">;
- defm SQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">;
- defm SQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">;
- defm SQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">;
- defm SQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">;
- defm UQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">;
- defm UQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">;
- defm UQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">;
- defm UQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">;
-
- // SVE2 integer add/subtract narrow high part
- defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b000, "addhnb">;
- defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b001, "addhnt">;
- defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">;
- defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">;
- defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b100, "subhnb">;
- defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b101, "subhnt">;
- defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">;
- defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">;
-
- // SVE2 saturating extract narrow
- defm SQXTNB_ZZ : sve2_int_sat_extract_narrow<0b000, "sqxtnb">;
- defm SQXTNT_ZZ : sve2_int_sat_extract_narrow<0b001, "sqxtnt">;
- defm UQXTNB_ZZ : sve2_int_sat_extract_narrow<0b010, "uqxtnb">;
- defm UQXTNT_ZZ : sve2_int_sat_extract_narrow<0b011, "uqxtnt">;
- defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">;
- defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">;
+ // SVE2 bitwise shift right narrow (bottom)
+ defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">;
+ defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">;
+ defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">;
+ defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">;
+ defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">;
+ defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">;
+ defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">;
+ defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">;
+
+ // SVE2 bitwise shift right narrow (top)
+ defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">;
+ defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">;
+ defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">;
+ defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">;
+ defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">;
+ defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">;
+ defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">;
+ defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">;
+
+ // SVE2 integer add/subtract narrow high part (bottom)
+ defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">;
+ defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">;
+ defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">;
+ defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">;
+
+ // SVE2 integer add/subtract narrow high part (top)
+ defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt">;
+ defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">;
+ defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt">;
+ defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">;
+
+ // SVE2 saturating extract narrow (bottom)
+ defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">;
+ defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">;
+ defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">;
+
+ // SVE2 saturating extract narrow (top)
+ defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">;
+ defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">;
+ defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">;
// SVE2 character match
defm MATCH_PPzZZ : sve2_char_match<0b0, "match">;
@@ -1289,10 +1352,14 @@ let Predicates = [HasSVE2] in {
// SVE2 histogram generation (vector)
defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;
+ // SVE2 floating-point base 2 logarithm as integer
+ defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
+
// SVE2 floating-point convert precision
defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">;
defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">;
defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">;
+ def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
// SVE2 floating-point pairwise operations
defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">;
@@ -1321,58 +1388,45 @@ let Predicates = [HasSVE2] in {
def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">;
def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">;
- // sve_int_rotate_imm
+ // SVE2 bitwise xor and rotate right by immediate
defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">;
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
- // SVE floating-point convert precision
- def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
-
- // SVE floating-point convert to integer
- defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
-
- // Non-temporal contiguous loads (vector + register)
- defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
- defm LDNT1B_ZZR_S : sve2_mem_cldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
- defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
- defm LDNT1H_ZZR_S : sve2_mem_cldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
- defm LDNT1W_ZZR_S : sve2_mem_cldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
-
- defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
- defm LDNT1B_ZZR_D : sve2_mem_cldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
- defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
- defm LDNT1H_ZZR_D : sve2_mem_cldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
- defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
- defm LDNT1W_ZZR_D : sve2_mem_cldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
- defm LDNT1D_ZZR_D : sve2_mem_cldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
+ // SVE2 non-temporal gather loads
+ defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
+ defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
+ defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
+ defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
+ defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
+
+ defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
+ defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
+ defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
+ defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
+ defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
+ defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
+ defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
- // Predicated shifts
- defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
- defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
- defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
- defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
- defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
-
- // Non-temporal contiguous stores (vector + register)
- defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
- defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
- defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
+ // SVE2 non-temporal scatter stores
+ defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
+ defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
+ defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
- defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
- defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
- defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
- defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
+ defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
+ defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
+ defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
+ defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
- // SVE table lookup (three sources)
+ // SVE2 table lookup (three sources)
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">;
defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">;
- // SVE integer compare scalar count and limit
+ // SVE2 integer compare scalar count and limit
defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">;
defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">;
defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">;
@@ -1383,7 +1437,7 @@ let Predicates = [HasSVE2] in {
defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">;
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">;
- // SVE pointer conflict compare
+ // SVE2 pointer conflict compare
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">;
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">;
}
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 60dbace03ca6..ba61ed726e84 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -32,7 +32,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
const AArch64TargetLowering &TLI = *STI.getTargetLowering();
EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
- Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = Type::getInt8PtrTy(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst;
diff --git a/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/lib/Target/AArch64/AArch64SpeculationHardening.cpp
index 3087e6ce441d..7307961ddb5f 100644
--- a/lib/Target/AArch64/AArch64SpeculationHardening.cpp
+++ b/lib/Target/AArch64/AArch64SpeculationHardening.cpp
@@ -106,6 +106,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
@@ -115,9 +116,9 @@ using namespace llvm;
#define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass"
-cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden,
- cl::desc("Sanitize loads from memory."),
- cl::init(true));
+static cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden,
+ cl::desc("Sanitize loads from memory."),
+ cl::init(true));
namespace {
@@ -521,7 +522,7 @@ bool AArch64SpeculationHardening::slhLoads(MachineBasicBlock &MBB) {
for (auto Use : MI.uses()) {
if (!Use.isReg())
continue;
- unsigned Reg = Use.getReg();
+ Register Reg = Use.getReg();
// Some loads of floating point data have implicit defs/uses on a
// super register of that floating point data. Some examples:
// $s0 = LDRSui $sp, 22, implicit-def $q0
@@ -561,8 +562,8 @@ bool AArch64SpeculationHardening::expandSpeculationSafeValue(
// miss-speculation isn't happening because we're already inserting barriers
// to guarantee that.
if (!UseControlFlowSpeculationBarrier && !UsesFullSpeculationBarrier) {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
// Mark this register and all its aliasing registers as needing to be
// value speculation hardened before its next use, by using a CSDB
// barrier instruction.
diff --git a/lib/Target/AArch64/AArch64StackOffset.h b/lib/Target/AArch64/AArch64StackOffset.h
new file mode 100644
index 000000000000..13f12a6c9c30
--- /dev/null
+++ b/lib/Target/AArch64/AArch64StackOffset.h
@@ -0,0 +1,138 @@
+//==--AArch64StackOffset.h ---------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the StackOffset class, which is used to
+// describe scalable and non-scalable offsets during frame lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H
+
+#include "llvm/Support/MachineValueType.h"
+
+namespace llvm {
+
+/// StackOffset is a wrapper around scalable and non-scalable offsets and is
+/// used in several functions such as 'isAArch64FrameOffsetLegal' and
+/// 'emitFrameOffset()'. StackOffsets are described by MVTs, e.g.
+//
+/// StackOffset(1, MVT::nxv16i8)
+//
+/// would describe an offset as being the size of a single SVE vector.
+///
+/// The class also implements simple arithmetic (addition/subtraction) on these
+/// offsets, e.g.
+//
+/// StackOffset(1, MVT::nxv16i8) + StackOffset(1, MVT::i64)
+//
+/// describes an offset that spans the combined storage required for an SVE
+/// vector and a 64bit GPR.
+class StackOffset {
+ int64_t Bytes;
+ int64_t ScalableBytes;
+
+ explicit operator int() const;
+
+public:
+ using Part = std::pair<int64_t, MVT>;
+
+ StackOffset() : Bytes(0), ScalableBytes(0) {}
+
+ StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() {
+ assert(MVT(T).getSizeInBits() % 8 == 0 &&
+ "Offset type is not a multiple of bytes");
+ *this += Part(Offset, T);
+ }
+
+ StackOffset(const StackOffset &Other)
+ : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {}
+
+ StackOffset &operator=(const StackOffset &) = default;
+
+ StackOffset &operator+=(const StackOffset::Part &Other) {
+ int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8);
+ if (Other.second.isScalableVector())
+ ScalableBytes += OffsetInBytes;
+ else
+ Bytes += OffsetInBytes;
+ return *this;
+ }
+
+ StackOffset &operator+=(const StackOffset &Other) {
+ Bytes += Other.Bytes;
+ ScalableBytes += Other.ScalableBytes;
+ return *this;
+ }
+
+ StackOffset operator+(const StackOffset &Other) const {
+ StackOffset Res(*this);
+ Res += Other;
+ return Res;
+ }
+
+ StackOffset &operator-=(const StackOffset &Other) {
+ Bytes -= Other.Bytes;
+ ScalableBytes -= Other.ScalableBytes;
+ return *this;
+ }
+
+ StackOffset operator-(const StackOffset &Other) const {
+ StackOffset Res(*this);
+ Res -= Other;
+ return Res;
+ }
+
+ StackOffset operator-() const {
+ StackOffset Res = {};
+ const StackOffset Other(*this);
+ Res -= Other;
+ return Res;
+ }
+
+ /// Returns the scalable part of the offset in bytes.
+ int64_t getScalableBytes() const { return ScalableBytes; }
+
+ /// Returns the non-scalable part of the offset in bytes.
+ int64_t getBytes() const { return Bytes; }
+
+ /// Decomposes this offset into the parts needed to describe a frame offset:
+ /// a byte offset, a number of predicate vectors and a number of data vectors.
+ /// For non-scalable offsets only the byte part is non-zero.
+ void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors,
+ int64_t &NumDataVectors) const {
+ assert(isValid() && "Invalid frame offset");
+
+ NumBytes = Bytes;
+ NumDataVectors = 0;
+ NumPredicateVectors = ScalableBytes / 2;
+ // This method is used to get the offsets to adjust the frame offset.
+ // If the function requires ADDPL to be used and needs more than two ADDPL
+ // instructions, part of the offset is folded into NumDataVectors so that it
+ // uses ADDVL for part of it, reducing the number of ADDPL instructions.
+ if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
+ NumPredicateVectors > 62) {
+ NumDataVectors = NumPredicateVectors / 8;
+ NumPredicateVectors -= NumDataVectors * 8;
+ }
+ }
+
+ /// Returns whether the offset is known zero.
+ explicit operator bool() const { return Bytes || ScalableBytes; }
+
+ bool isValid() const {
+ // The smallest scalable element supported by scaled SVE addressing
+ // modes are predicates, which are 2 scalable bytes in size. So the scalable
+ // byte offset must always be a multiple of 2.
+ return ScalableBytes % 2 == 0;
+ }
+};
+
+} // end namespace llvm
+
+#endif
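To make the scalable/non-scalable bookkeeping above more concrete, here is a small standalone sketch (not part of the patch) that models the same arithmetic: byte and scalable-byte parts are accumulated separately, and the getForFrameOffset() step folds groups of eight predicate-sized (ADDPL) increments into whole-vector (ADDVL) increments, mirroring the condition in the header above. The type name DemoStackOffset and the sample values are illustrative assumptions, not LLVM API.

// Standalone model of the StackOffset decomposition (illustrative only).
#include <cassert>
#include <cstdint>
#include <cstdio>

struct DemoStackOffset {
  int64_t Bytes = 0;         // non-scalable part, in bytes
  int64_t ScalableBytes = 0; // scalable part, in scalable bytes

  void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors,
                         int64_t &NumDataVectors) const {
    assert(ScalableBytes % 2 == 0 && "predicates are 2 scalable bytes each");
    NumBytes = Bytes;
    NumDataVectors = 0;
    NumPredicateVectors = ScalableBytes / 2; // ADDPL units
    // Fold multiples of 8 ADDPL units (one full data vector) into ADDVL units
    // when ADDPL alone would be out of range or wasteful.
    if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
        NumPredicateVectors > 62) {
      NumDataVectors = NumPredicateVectors / 8;
      NumPredicateVectors -= NumDataVectors * 8;
    }
  }
};

int main() {
  DemoStackOffset Off;
  Off.Bytes = 32;         // e.g. a 64-bit GPR pair plus padding
  Off.ScalableBytes = 16; // one SVE data vector, as for nxv16i8
  int64_t NumBytes, NumPredicateVectors, NumDataVectors;
  Off.getForFrameOffset(NumBytes, NumPredicateVectors, NumDataVectors);
  std::printf("bytes=%lld addpl=%lld addvl=%lld\n", (long long)NumBytes,
              (long long)NumPredicateVectors, (long long)NumDataVectors);
  // Prints: bytes=32 addpl=0 addvl=1
  return 0;
}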
diff --git a/lib/Target/AArch64/AArch64StackTagging.cpp b/lib/Target/AArch64/AArch64StackTagging.cpp
index 6e99c48bf1d7..e6dbe01d3807 100644
--- a/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -55,9 +56,215 @@ using namespace llvm;
#define DEBUG_TYPE "stack-tagging"
-static constexpr unsigned kTagGranuleSize = 16;
+static cl::opt<bool> ClMergeInit(
+ "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore,
+ cl::desc("merge stack variable initializers with tagging when possible"));
+
+static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit",
+ cl::init(40), cl::Hidden);
+
+static const Align kTagGranuleSize = Align(16);
namespace {
+
+class InitializerBuilder {
+ uint64_t Size;
+ const DataLayout *DL;
+ Value *BasePtr;
+ Function *SetTagFn;
+ Function *SetTagZeroFn;
+ Function *StgpFn;
+
+ // List of initializers sorted by start offset.
+ struct Range {
+ uint64_t Start, End;
+ Instruction *Inst;
+ };
+ SmallVector<Range, 4> Ranges;
+ // 8-aligned offset => 8-byte initializer
+ // Missing keys are zero initialized.
+ std::map<uint64_t, Value *> Out;
+
+public:
+ InitializerBuilder(uint64_t Size, const DataLayout *DL, Value *BasePtr,
+ Function *SetTagFn, Function *SetTagZeroFn,
+ Function *StgpFn)
+ : Size(Size), DL(DL), BasePtr(BasePtr), SetTagFn(SetTagFn),
+ SetTagZeroFn(SetTagZeroFn), StgpFn(StgpFn) {}
+
+ bool addRange(uint64_t Start, uint64_t End, Instruction *Inst) {
+ auto I = std::lower_bound(
+ Ranges.begin(), Ranges.end(), Start,
+ [](const Range &LHS, uint64_t RHS) { return LHS.End <= RHS; });
+ if (I != Ranges.end() && End > I->Start) {
+ // Overlap - bail.
+ return false;
+ }
+ Ranges.insert(I, {Start, End, Inst});
+ return true;
+ }
+
+ bool addStore(uint64_t Offset, StoreInst *SI, const DataLayout *DL) {
+ int64_t StoreSize = DL->getTypeStoreSize(SI->getOperand(0)->getType());
+ if (!addRange(Offset, Offset + StoreSize, SI))
+ return false;
+ IRBuilder<> IRB(SI);
+ applyStore(IRB, Offset, Offset + StoreSize, SI->getOperand(0));
+ return true;
+ }
+
+ bool addMemSet(uint64_t Offset, MemSetInst *MSI) {
+ uint64_t StoreSize = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ if (!addRange(Offset, Offset + StoreSize, MSI))
+ return false;
+ IRBuilder<> IRB(MSI);
+ applyMemSet(IRB, Offset, Offset + StoreSize,
+ cast<ConstantInt>(MSI->getValue()));
+ return true;
+ }
+
+ void applyMemSet(IRBuilder<> &IRB, int64_t Start, int64_t End,
+ ConstantInt *V) {
+ // Out[] does not distinguish between zero and undef, and we already know
+ // that this memset does not overlap with any other initializer. Nothing to
+ // do for memset(0).
+ if (V->isZero())
+ return;
+ for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) {
+ uint64_t Cst = 0x0101010101010101UL;
+ int LowBits = Offset < Start ? (Start - Offset) * 8 : 0;
+ if (LowBits)
+ Cst = (Cst >> LowBits) << LowBits;
+ int HighBits = End - Offset < 8 ? (8 - (End - Offset)) * 8 : 0;
+ if (HighBits)
+ Cst = (Cst << HighBits) >> HighBits;
+ ConstantInt *C =
+ ConstantInt::get(IRB.getInt64Ty(), Cst * V->getZExtValue());
+
+ Value *&CurrentV = Out[Offset];
+ if (!CurrentV) {
+ CurrentV = C;
+ } else {
+ CurrentV = IRB.CreateOr(CurrentV, C);
+ }
+ }
+ }
+
+ // Take a 64-bit slice of the value starting at the given offset (in bytes).
+ // Offset can be negative. Pad with zeroes on both sides when necessary.
+ Value *sliceValue(IRBuilder<> &IRB, Value *V, int64_t Offset) {
+ if (Offset > 0) {
+ V = IRB.CreateLShr(V, Offset * 8);
+ V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty());
+ } else if (Offset < 0) {
+ V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty());
+ V = IRB.CreateShl(V, -Offset * 8);
+ } else {
+ V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty());
+ }
+ return V;
+ }
+
+ void applyStore(IRBuilder<> &IRB, int64_t Start, int64_t End,
+ Value *StoredValue) {
+ StoredValue = flatten(IRB, StoredValue);
+ for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) {
+ Value *V = sliceValue(IRB, StoredValue, Offset - Start);
+ Value *&CurrentV = Out[Offset];
+ if (!CurrentV) {
+ CurrentV = V;
+ } else {
+ CurrentV = IRB.CreateOr(CurrentV, V);
+ }
+ }
+ }
+
+ void generate(IRBuilder<> &IRB) {
+ LLVM_DEBUG(dbgs() << "Combined initializer\n");
+ // No initializers => the entire allocation is undef.
+ if (Ranges.empty()) {
+ emitUndef(IRB, 0, Size);
+ return;
+ }
+
+ // Look through 8-byte initializer list 16 bytes at a time;
+ // If one of the two 8-byte halves is non-zero and non-undef, emit STGP.
+ // Otherwise, emit zeroes up to next available item.
+ uint64_t LastOffset = 0;
+ for (uint64_t Offset = 0; Offset < Size; Offset += 16) {
+ auto I1 = Out.find(Offset);
+ auto I2 = Out.find(Offset + 8);
+ if (I1 == Out.end() && I2 == Out.end())
+ continue;
+
+ if (Offset > LastOffset)
+ emitZeroes(IRB, LastOffset, Offset - LastOffset);
+
+ Value *Store1 = I1 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty())
+ : I1->second;
+ Value *Store2 = I2 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty())
+ : I2->second;
+ emitPair(IRB, Offset, Store1, Store2);
+ LastOffset = Offset + 16;
+ }
+
+ // memset(0) does not update Out[], therefore the tail can be either undef
+ // or zero.
+ if (LastOffset < Size)
+ emitZeroes(IRB, LastOffset, Size - LastOffset);
+
+ for (const auto &R : Ranges) {
+ R.Inst->eraseFromParent();
+ }
+ }
+
+ void emitZeroes(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) {
+ LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size
+ << ") zero\n");
+ Value *Ptr = BasePtr;
+ if (Offset)
+ Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+ IRB.CreateCall(SetTagZeroFn,
+ {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+ }
+
+ void emitUndef(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) {
+ LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size
+ << ") undef\n");
+ Value *Ptr = BasePtr;
+ if (Offset)
+ Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+ IRB.CreateCall(SetTagFn, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+ }
+
+ void emitPair(IRBuilder<> &IRB, uint64_t Offset, Value *A, Value *B) {
+ LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + 16 << "):\n");
+ LLVM_DEBUG(dbgs() << " " << *A << "\n " << *B << "\n");
+ Value *Ptr = BasePtr;
+ if (Offset)
+ Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+ IRB.CreateCall(StgpFn, {Ptr, A, B});
+ }
+
+ Value *flatten(IRBuilder<> &IRB, Value *V) {
+ if (V->getType()->isIntegerTy())
+ return V;
+ // vector of pointers -> vector of ints
+ if (VectorType *VecTy = dyn_cast<VectorType>(V->getType())) {
+ LLVMContext &Ctx = IRB.getContext();
+ Type *EltTy = VecTy->getElementType();
+ if (EltTy->isPointerTy()) {
+ uint32_t EltSize = DL->getTypeSizeInBits(EltTy);
+ Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize),
+ VecTy->getNumElements());
+ V = IRB.CreatePointerCast(V, NewTy);
+ }
+ }
+ return IRB.CreateBitOrPointerCast(
+ V, IRB.getIntNTy(DL->getTypeStoreSize(V->getType()) * 8));
+ }
+};
+
class AArch64StackTagging : public FunctionPass {
struct AllocaInfo {
AllocaInst *AI;
@@ -67,10 +274,15 @@ class AArch64StackTagging : public FunctionPass {
int Tag; // -1 for non-tagged allocations
};
+ bool MergeInit;
+
public:
static char ID; // Pass ID, replacement for typeid
- AArch64StackTagging() : FunctionPass(ID) {
+ AArch64StackTagging(bool MergeInit = true)
+ : FunctionPass(ID),
+ MergeInit(ClMergeInit.getNumOccurrences() > 0 ? ClMergeInit
+ : MergeInit) {
initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry());
}
@@ -81,6 +293,9 @@ public:
uint64_t Size);
void untagAlloca(AllocaInst *AI, Instruction *InsertBefore, uint64_t Size);
+ Instruction *collectInitializers(Instruction *StartInst, Value *StartPtr,
+ uint64_t Size, InitializerBuilder &IB);
+
Instruction *
insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas,
const DominatorTree *DT);
@@ -92,9 +307,12 @@ private:
Function *F;
Function *SetTagFunc;
const DataLayout *DL;
+ AAResults *AA;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ if (MergeInit)
+ AU.addRequired<AAResultsWrapperPass>();
}
};
@@ -107,8 +325,68 @@ INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
false, false)
-FunctionPass *llvm::createAArch64StackTaggingPass() {
- return new AArch64StackTagging();
+FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) {
+ return new AArch64StackTagging(MergeInit);
+}
+
+Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst,
+ Value *StartPtr,
+ uint64_t Size,
+ InitializerBuilder &IB) {
+ MemoryLocation AllocaLoc{StartPtr, Size};
+ Instruction *LastInst = StartInst;
+ BasicBlock::iterator BI(StartInst);
+
+ unsigned Count = 0;
+ for (; Count < ClScanLimit && !BI->isTerminator(); ++BI) {
+ if (!isa<DbgInfoIntrinsic>(*BI))
+ ++Count;
+
+ if (isNoModRef(AA->getModRefInfo(&*BI, AllocaLoc)))
+ continue;
+
+ if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+ // If the instruction is readnone, ignore it, otherwise bail out. We
+ // don't even allow readonly here because we don't want something like:
+ // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+ if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+ break;
+ continue;
+ }
+
+ if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ if (!NextStore->isSimple())
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ Optional<int64_t> Offset =
+ isPointerOffset(StartPtr, NextStore->getPointerOperand(), *DL);
+ if (!Offset)
+ break;
+
+ if (!IB.addStore(*Offset, NextStore, DL))
+ break;
+ LastInst = NextStore;
+ } else {
+ MemSetInst *MSI = cast<MemSetInst>(BI);
+
+ if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+ break;
+
+ if (!isa<ConstantInt>(MSI->getValue()))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), *DL);
+ if (!Offset)
+ break;
+
+ if (!IB.addMemSet(*Offset, MSI))
+ break;
+ LastInst = MSI;
+ }
+ }
+ return LastInst;
}
bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
@@ -127,8 +405,23 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore,
Value *Ptr, uint64_t Size) {
+ auto SetTagZeroFunc =
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag_zero);
+ auto StgpFunc =
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_stgp);
+
+ InitializerBuilder IB(Size, DL, Ptr, SetTagFunc, SetTagZeroFunc, StgpFunc);
+ bool LittleEndian =
+ Triple(AI->getModule()->getTargetTriple()).isLittleEndian();
+ // Current implementation of initializer merging assumes little endianness.
+ if (MergeInit && !F->hasOptNone() && LittleEndian) {
+ LLVM_DEBUG(dbgs() << "collecting initializers for " << *AI
+ << ", size = " << Size << "\n");
+ InsertBefore = collectInitializers(InsertBefore, Ptr, Size, IB);
+ }
+
IRBuilder<> IRB(InsertBefore);
- IRB.CreateCall(SetTagFunc, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+ IB.generate(IRB);
}
void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore,
@@ -166,7 +459,8 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer(
}
void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
- unsigned NewAlignment = std::max(Info.AI->getAlignment(), kTagGranuleSize);
+ const Align NewAlignment =
+ max(MaybeAlign(Info.AI->getAlignment()), kTagGranuleSize);
Info.AI->setAlignment(NewAlignment);
uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
@@ -179,7 +473,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
Info.AI->isArrayAllocation()
? ArrayType::get(
Info.AI->getAllocatedType(),
- dyn_cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue())
+ cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue())
: Info.AI->getAllocatedType();
Type *PaddingType =
ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size);
@@ -187,7 +481,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
auto *NewAI = new AllocaInst(
TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI);
NewAI->takeName(Info.AI);
- NewAI->setAlignment(Info.AI->getAlignment());
+ NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment()));
NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca());
NewAI->setSwiftError(Info.AI->isSwiftError());
NewAI->copyMetadata(*Info.AI);
@@ -198,6 +492,24 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
Info.AI = NewAI;
}
+// Helper function to check for post-dominance.
+static bool postDominates(const PostDominatorTree *PDT, const IntrinsicInst *A,
+ const IntrinsicInst *B) {
+ const BasicBlock *ABB = A->getParent();
+ const BasicBlock *BBB = B->getParent();
+
+ if (ABB != BBB)
+ return PDT->dominates(ABB, BBB);
+
+ for (const Instruction &I : *ABB) {
+ if (&I == B)
+ return true;
+ if (&I == A)
+ return false;
+ }
+ llvm_unreachable("Corrupt instruction list");
+}
+
// FIXME: check for MTE extension
bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag))
@@ -205,6 +517,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
F = &Fn;
DL = &Fn.getParent()->getDataLayout();
+ if (MergeInit)
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order
SmallVector<Instruction *, 8> RetVec;
@@ -270,23 +584,31 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (NumInterestingAllocas == 0)
return true;
+ std::unique_ptr<DominatorTree> DeleteDT;
+ DominatorTree *DT = nullptr;
+ if (auto *P = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+ DT = &P->getDomTree();
+
+ if (DT == nullptr && (NumInterestingAllocas > 1 ||
+ !F->hasFnAttribute(Attribute::OptimizeNone))) {
+ DeleteDT = std::make_unique<DominatorTree>(*F);
+ DT = DeleteDT.get();
+ }
+
+ std::unique_ptr<PostDominatorTree> DeletePDT;
+ PostDominatorTree *PDT = nullptr;
+ if (auto *P = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>())
+ PDT = &P->getPostDomTree();
+
+ if (PDT == nullptr && !F->hasFnAttribute(Attribute::OptimizeNone)) {
+ DeletePDT = std::make_unique<PostDominatorTree>(*F);
+ PDT = DeletePDT.get();
+ }
+
SetTagFunc =
Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag);
- // Compute DT only if the function has the attribute, there are more than 1
- // interesting allocas, and it is not available for free.
- Instruction *Base;
- if (NumInterestingAllocas > 1) {
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- if (DTWP) {
- Base = insertBaseTaggedPointer(Allocas, &DTWP->getDomTree());
- } else {
- DominatorTree DT(*F);
- Base = insertBaseTaggedPointer(Allocas, &DT);
- }
- } else {
- Base = insertBaseTaggedPointer(Allocas, nullptr);
- }
+ Instruction *Base = insertBaseTaggedPointer(Allocas, DT);
for (auto &I : Allocas) {
const AllocaInfo &Info = I.second;
@@ -309,11 +631,37 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 &&
Info.LifetimeEnd.size() == 1) {
IntrinsicInst *Start = Info.LifetimeStart[0];
+ IntrinsicInst *End = Info.LifetimeEnd[0];
uint64_t Size =
dyn_cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
Size = alignTo(Size, kTagGranuleSize);
tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size);
- untagAlloca(AI, Info.LifetimeEnd[0], Size);
+ // We need to ensure that if we tag some object, we certainly untag it
+ // before the function exits.
+ if (PDT != nullptr && postDominates(PDT, End, Start)) {
+ untagAlloca(AI, End, Size);
+ } else {
+ SmallVector<Instruction *, 8> ReachableRetVec;
+ unsigned NumCoveredExits = 0;
+ for (auto &RI : RetVec) {
+ if (!isPotentiallyReachable(Start, RI, nullptr, DT))
+ continue;
+ ReachableRetVec.push_back(RI);
+ if (DT != nullptr && DT->dominates(End, RI))
+ ++NumCoveredExits;
+ }
+ // If every reachable exit is dominated by the lifetime end, a single untag
+ // there is enough; otherwise untag on each reachable exit so nothing is
+ // untagged twice.
+ if (NumCoveredExits == ReachableRetVec.size()) {
+ untagAlloca(AI, End, Size);
+ } else {
+ for (auto &RI : ReachableRetVec)
+ untagAlloca(AI, RI, Size);
+ // We may have inserted untag outside of the lifetime interval.
+ // Remove the lifetime end call for this alloca.
+ End->eraseFromParent();
+ }
+ }
} else {
uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8;
Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy());
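The mask arithmetic in InitializerBuilder::applyMemSet above is easy to misread in diff form. The standalone sketch below (illustrative, not the patch itself) reproduces the per-slot computation: for an 8-byte slot that a memset only partially covers, bytes below the start and at or past the end are cleared from a 0x01-splat mask, which is then multiplied by the memset byte value, assuming the little-endian layout the pass requires. The function name slotValueForMemset and the sample call are assumptions for the demo.

// Standalone model of the per-slot memset masking used above (little-endian).
#include <cstdint>
#include <cstdio>

// Returns the 64-bit value for the 8-byte slot starting at SlotOffset when
// bytes [Start, End) are set to the byte value V; untouched bytes stay zero.
static uint64_t slotValueForMemset(int64_t SlotOffset, int64_t Start,
                                   int64_t End, uint8_t V) {
  uint64_t Cst = 0x0101010101010101ULL;
  int LowBits = SlotOffset < Start ? int(Start - SlotOffset) * 8 : 0;
  if (LowBits)
    Cst = (Cst >> LowBits) << LowBits;   // clear bytes below Start
  int HighBits = End - SlotOffset < 8 ? int(8 - (End - SlotOffset)) * 8 : 0;
  if (HighBits)
    Cst = (Cst << HighBits) >> HighBits; // clear bytes at/after End
  return Cst * V;                        // splat V into the remaining bytes
}

int main() {
  // memset(p + 3, 0xAB, 4) within the slot at offset 0 sets bytes 3..6.
  std::printf("0x%016llx\n",
              (unsigned long long)slotValueForMemset(0, 3, 7, 0xAB));
  // Prints 0x00abababab000000: bytes 3..6 hold 0xAB, the rest stay zero.
  return 0;
}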
diff --git a/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
new file mode 100644
index 000000000000..3cc556f74aea
--- /dev/null
+++ b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -0,0 +1,209 @@
+//===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-stack-tagging-pre-ra"
+
+enum UncheckedLdStMode { UncheckedNever, UncheckedSafe, UncheckedAlways };
+
+cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
+ "stack-tagging-unchecked-ld-st", cl::Hidden,
+ cl::init(UncheckedSafe),
+ cl::desc(
+ "Unconditionally apply unchecked-ld-st optimization (even for large "
+ "stack frames, or in the presence of variable sized allocas)."),
+ cl::values(
+ clEnumValN(UncheckedNever, "never", "never apply unchecked-ld-st"),
+ clEnumValN(
+ UncheckedSafe, "safe",
+ "apply unchecked-ld-st when the target is definitely within range"),
+ clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));
+
+namespace {
+
+class AArch64StackTaggingPreRA : public MachineFunctionPass {
+ MachineFunction *MF;
+ AArch64FunctionInfo *AFI;
+ MachineFrameInfo *MFI;
+ MachineRegisterInfo *MRI;
+ const AArch64RegisterInfo *TRI;
+ const AArch64InstrInfo *TII;
+
+ SmallVector<MachineInstr*, 16> ReTags;
+
+public:
+ static char ID;
+ AArch64StackTaggingPreRA() : MachineFunctionPass(ID) {
+ initializeAArch64StackTaggingPreRAPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool mayUseUncheckedLoadStore();
+ void uncheckUsesOf(unsigned TaggedReg, int FI);
+ void uncheckLoadsAndStores();
+
+ bool runOnMachineFunction(MachineFunction &Func) override;
+ StringRef getPassName() const override {
+ return "AArch64 Stack Tagging PreRA";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // end anonymous namespace
+
+char AArch64StackTaggingPreRA::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
+ "AArch64 Stack Tagging PreRA Pass", false, false)
+INITIALIZE_PASS_END(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
+ "AArch64 Stack Tagging PreRA Pass", false, false)
+
+FunctionPass *llvm::createAArch64StackTaggingPreRAPass() {
+ return new AArch64StackTaggingPreRA();
+}
+
+static bool isUncheckedLoadOrStoreOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case AArch64::LDRWui:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRXui:
+ case AArch64::LDRBui:
+ case AArch64::LDRBBui:
+ case AArch64::LDRHui:
+ case AArch64::LDRSui:
+ case AArch64::LDRDui:
+ case AArch64::LDRQui:
+ case AArch64::STRWui:
+ case AArch64::STRXui:
+ case AArch64::STRBui:
+ case AArch64::STRBBui:
+ case AArch64::STRHui:
+ case AArch64::STRSui:
+ case AArch64::STRDui:
+ case AArch64::STRQui:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() {
+ if (ClUncheckedLdSt == UncheckedNever)
+ return false;
+ else if (ClUncheckedLdSt == UncheckedAlways)
+ return true;
+
+ // This estimate could be improved if we had stronger guarantees about stack frame
+ // layout. With LocalStackAllocation we can estimate SP offset to any
+ // preallocated slot. AArch64FrameLowering::orderFrameObjects could put tagged
+ // objects ahead of non-tagged ones, but that's not always desirable.
+ //
+ // Underestimating SP offset here may require the use of LDG to materialize
+ // the tagged address of the stack slot, along with a scratch register
+ // allocation (post-regalloc!).
+ //
+ // For now we do the safe thing here and require that the entire stack frame
+ // is within range of the unchecked instruction with the shortest reach.
+ unsigned FrameSize = 0;
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i)
+ FrameSize += MFI->getObjectSize(i);
+ bool EntireFrameReachableFromSP = FrameSize < 0xf00;
+ return !MFI->hasVarSizedObjects() && EntireFrameReachableFromSP;
+}
+
+void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) {
+ for (auto UI = MRI->use_instr_begin(TaggedReg), E = MRI->use_instr_end();
+ UI != E;) {
+ MachineInstr *UseI = &*(UI++);
+ if (isUncheckedLoadOrStoreOpcode(UseI->getOpcode())) {
+ // FI operand is always the one before the immediate offset.
+ unsigned OpIdx = TII->getLoadStoreImmIdx(UseI->getOpcode()) - 1;
+ if (UseI->getOperand(OpIdx).isReg() &&
+ UseI->getOperand(OpIdx).getReg() == TaggedReg) {
+ UseI->getOperand(OpIdx).ChangeToFrameIndex(FI);
+ UseI->getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED);
+ }
+ } else if (UseI->isCopy() &&
+ Register::isVirtualRegister(UseI->getOperand(0).getReg())) {
+ uncheckUsesOf(UseI->getOperand(0).getReg(), FI);
+ }
+ }
+}
+
+void AArch64StackTaggingPreRA::uncheckLoadsAndStores() {
+ for (auto *I : ReTags) {
+ unsigned TaggedReg = I->getOperand(0).getReg();
+ int FI = I->getOperand(1).getIndex();
+ uncheckUsesOf(TaggedReg, FI);
+ }
+}
+
+bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ MRI = &MF->getRegInfo();
+ AFI = MF->getInfo<AArch64FunctionInfo>();
+ TII = static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo());
+ TRI = static_cast<const AArch64RegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+ MFI = &MF->getFrameInfo();
+ ReTags.clear();
+
+ assert(MRI->isSSA());
+
+ LLVM_DEBUG(dbgs() << "********** AArch64 Stack Tagging PreRA **********\n"
+ << "********** Function: " << MF->getName() << '\n');
+
+ SmallSetVector<int, 8> TaggedSlots;
+ for (auto &BB : *MF) {
+ for (auto &I : BB) {
+ if (I.getOpcode() == AArch64::TAGPstack) {
+ ReTags.push_back(&I);
+ int FI = I.getOperand(1).getIndex();
+ TaggedSlots.insert(FI);
+ // There should be no offsets in TAGP yet.
+ assert(I.getOperand(2).getImm() == 0);
+ }
+ }
+ }
+
+ if (ReTags.empty())
+ return false;
+
+ if (mayUseUncheckedLoadStore())
+ uncheckLoadsAndStores();
+
+ return true;
+}
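A hedged sketch of the range estimate in mayUseUncheckedLoadStore() above (not LLVM code; the helper name is invented). The byte forms LDRBui/STRBui take an unsigned 12-bit immediate scaled by the access size, so a byte access reaches at most 4095 bytes from its base register; comparing the summed object sizes against 0xf00 (3840) presumably leaves slack for alignment padding that the plain sum ignores:

  #include <cstdint>
  #include <vector>

  // Allow SP-relative unchecked forms only when every fixed slot is certain
  // to be reachable by the shortest-ranged unchecked instruction and there
  // are no variable-sized allocas.
  bool frameFitsUncheckedRange(const std::vector<uint64_t> &ObjectSizes,
                               bool HasVarSizedObjects) {
    uint64_t FrameSize = 0;
    for (uint64_t Size : ObjectSizes)
      FrameSize += Size;
    return !HasVarSizedObjects && FrameSize < 0xf00; // 0xfff is the byte-access limit
  }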
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 0e84a00df006..5deb601822b8 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -151,7 +151,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
int64_t Offset;
if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) &&
BaseOp->isReg()) {
- unsigned BaseReg = BaseOp->getReg();
+ Register BaseReg = BaseOp->getReg();
if (PrevBaseReg == BaseReg) {
// If this block can take STPs, skip ahead to the next block.
if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 3bc89b91c3f7..558bea368eff 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -71,19 +71,22 @@ void AArch64Subtarget::initializeProperties() {
case CortexA35:
break;
case CortexA53:
- PrefFunctionAlignment = 3;
+ PrefFunctionLogAlignment = 3;
break;
case CortexA55:
break;
case CortexA57:
MaxInterleaveFactor = 4;
- PrefFunctionAlignment = 4;
+ PrefFunctionLogAlignment = 4;
+ break;
+ case CortexA65:
+ PrefFunctionLogAlignment = 3;
break;
case CortexA72:
case CortexA73:
case CortexA75:
case CortexA76:
- PrefFunctionAlignment = 4;
+ PrefFunctionLogAlignment = 4;
break;
case Cyclone:
CacheLineSize = 64;
@@ -94,14 +97,14 @@ void AArch64Subtarget::initializeProperties() {
case ExynosM1:
MaxInterleaveFactor = 4;
MaxJumpTableSize = 8;
- PrefFunctionAlignment = 4;
- PrefLoopAlignment = 3;
+ PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 3;
break;
case ExynosM3:
MaxInterleaveFactor = 4;
MaxJumpTableSize = 20;
- PrefFunctionAlignment = 5;
- PrefLoopAlignment = 4;
+ PrefFunctionLogAlignment = 5;
+ PrefLoopLogAlignment = 4;
break;
case Falkor:
MaxInterleaveFactor = 4;
@@ -122,6 +125,12 @@ void AArch64Subtarget::initializeProperties() {
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
+ case NeoverseE1:
+ PrefFunctionLogAlignment = 3;
+ break;
+ case NeoverseN1:
+ PrefFunctionLogAlignment = 4;
+ break;
case Saphira:
MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
@@ -129,8 +138,8 @@ void AArch64Subtarget::initializeProperties() {
break;
case ThunderX2T99:
CacheLineSize = 64;
- PrefFunctionAlignment = 3;
- PrefLoopAlignment = 2;
+ PrefFunctionLogAlignment = 3;
+ PrefLoopLogAlignment = 2;
MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
@@ -143,15 +152,15 @@ void AArch64Subtarget::initializeProperties() {
case ThunderXT81:
case ThunderXT83:
CacheLineSize = 128;
- PrefFunctionAlignment = 3;
- PrefLoopAlignment = 2;
+ PrefFunctionLogAlignment = 3;
+ PrefLoopLogAlignment = 2;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
case TSV110:
CacheLineSize = 64;
- PrefFunctionAlignment = 4;
- PrefLoopAlignment = 2;
+ PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 2;
break;
}
}
@@ -187,7 +196,7 @@ const CallLowering *AArch64Subtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
-const InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
+InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
return InstSelector.get();
}
@@ -201,7 +210,7 @@ const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
-unsigned char
+unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
const TargetMachine &TM) const {
// MachO large model always goes via a GOT, simply to get a single 8-byte
@@ -224,10 +233,17 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
GV->hasExternalWeakLinkage())
return AArch64II::MO_GOT;
+ // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
+ // that their nominal addresses are tagged and outside of the code model. In
+ // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
+ // tag if necessary based on MO_TAGGED.
+ if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
+ return AArch64II::MO_NC | AArch64II::MO_TAGGED;
+
return AArch64II::MO_NO_FLAG;
}
-unsigned char AArch64Subtarget::classifyGlobalFunctionReference(
+unsigned AArch64Subtarget::classifyGlobalFunctionReference(
const GlobalValue *GV, const TargetMachine &TM) const {
// MachO large model always goes via a GOT, because we don't have the
// relocations available to do anything else.
@@ -275,7 +291,7 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
- return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
+ return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}
void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
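As a hedged aside on the renames above (not part of the patch): the Pref*LogAlignment fields hold log2 of the alignment in bytes, so the per-CPU values stay numerically the same while the name now says what they mean. A value of 4 therefore requests 16-byte alignment:

  #include <cassert>

  // Log2 alignment as stored in the subtarget vs. the byte alignment it implies.
  unsigned logAlignToBytes(unsigned LogAlign) { return 1u << LogAlign; }

  int main() {
    assert(logAlignToBytes(3) == 8);  // e.g. Cortex-A53 functions
    assert(logAlignToBytes(4) == 16); // e.g. Cortex-A57/A72/Neoverse N1 functions
    assert(logAlignToBytes(5) == 32); // e.g. Exynos M3 functions
    return 0;
  }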
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 0c84cfb8329a..f3212fae8e5e 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -42,6 +42,7 @@ public:
CortexA53,
CortexA55,
CortexA57,
+ CortexA65,
CortexA72,
CortexA73,
CortexA75,
@@ -51,6 +52,8 @@ public:
ExynosM3,
Falkor,
Kryo,
+ NeoverseE1,
+ NeoverseN1,
Saphira,
ThunderX2T99,
ThunderX,
@@ -113,6 +116,7 @@ protected:
bool HasTRACEV8_4 = false;
bool HasAM = false;
bool HasSEL2 = false;
+ bool HasPMU = false;
bool HasTLB_RMI = false;
bool HasFMI = false;
bool HasRCPC_IMMO = false;
@@ -134,6 +138,7 @@ protected:
bool HasBTI = false;
bool HasRandGen = false;
bool HasMTE = false;
+ bool HasTME = false;
// Arm SVE2 extensions
bool HasSVE2AES = false;
@@ -141,6 +146,10 @@ protected:
bool HasSVE2SHA3 = false;
bool HasSVE2BitPerm = false;
+ // Future architecture extensions.
+ bool HasETE = false;
+ bool HasTRBE = false;
+
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove = false;
@@ -183,14 +192,15 @@ protected:
bool UseEL1ForTP = false;
bool UseEL2ForTP = false;
bool UseEL3ForTP = false;
+ bool AllowTaggedGlobals = false;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
uint16_t PrefetchDistance = 0;
uint16_t MinPrefetchStride = 1;
unsigned MaxPrefetchIterationsAhead = UINT_MAX;
- unsigned PrefFunctionAlignment = 0;
- unsigned PrefLoopAlignment = 0;
+ unsigned PrefFunctionLogAlignment = 0;
+ unsigned PrefLoopLogAlignment = 0;
unsigned MaxJumpTableSize = 0;
unsigned WideningBaseCost = 0;
@@ -247,7 +257,7 @@ public:
return &getInstrInfo()->getRegisterInfo();
}
const CallLowering *getCallLowering() const override;
- const InstructionSelector *getInstructionSelector() const override;
+ InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -344,14 +354,16 @@ public:
unsigned getVectorInsertExtractBaseCost() const {
return VectorInsertExtractBaseCost;
}
- unsigned getCacheLineSize() const { return CacheLineSize; }
- unsigned getPrefetchDistance() const { return PrefetchDistance; }
- unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
- unsigned getMaxPrefetchIterationsAhead() const {
+ unsigned getCacheLineSize() const override { return CacheLineSize; }
+ unsigned getPrefetchDistance() const override { return PrefetchDistance; }
+ unsigned getMinPrefetchStride() const override { return MinPrefetchStride; }
+ unsigned getMaxPrefetchIterationsAhead() const override {
return MaxPrefetchIterationsAhead;
}
- unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
- unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; }
+ unsigned getPrefFunctionLogAlignment() const {
+ return PrefFunctionLogAlignment;
+ }
+ unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; }
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
@@ -380,6 +392,7 @@ public:
bool hasBTI() const { return HasBTI; }
bool hasRandGen() const { return HasRandGen; }
bool hasMTE() const { return HasMTE; }
+ bool hasTME() const { return HasTME; }
// Arm SVE2 extensions
bool hasSVE2AES() const { return HasSVE2AES; }
bool hasSVE2SM4() const { return HasSVE2SM4; }
@@ -399,6 +412,8 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+ bool isTargetILP32() const { return TargetTriple.isArch32Bit(); }
+
bool useAA() const override { return UseAA; }
bool hasVH() const { return HasVH; }
@@ -421,10 +436,17 @@ public:
bool hasTRACEV8_4() const { return HasTRACEV8_4; }
bool hasAM() const { return HasAM; }
bool hasSEL2() const { return HasSEL2; }
+ bool hasPMU() const { return HasPMU; }
bool hasTLB_RMI() const { return HasTLB_RMI; }
bool hasFMI() const { return HasFMI; }
bool hasRCPC_IMMO() const { return HasRCPC_IMMO; }
+ bool addrSinkUsingGEPs() const override {
+ // Keeping GEPs inbounds is important for exploiting AArch64
+ // addressing-modes in ILP32 mode.
+ return useAA() || isTargetILP32();
+ }
+
bool useSmallAddressing() const {
switch (TLInfo.getTargetMachine().getCodeModel()) {
case CodeModel::Kernel:
@@ -443,11 +465,11 @@ public:
/// ClassifyGlobalReference - Find the target operand flags that describe
/// how a global value should be referenced for the current subtarget.
- unsigned char ClassifyGlobalReference(const GlobalValue *GV,
- const TargetMachine &TM) const;
+ unsigned ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM) const;
- unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
- const TargetMachine &TM) const;
+ unsigned classifyGlobalFunctionReference(const GlobalValue *GV,
+ const TargetMachine &TM) const;
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
index 536a6591478b..05249a4ea6a8 100644
--- a/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -612,6 +612,7 @@ def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>;
def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>;
def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>;
def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>;
+def : ROSysReg<"ID_MMFR5_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b110>;
// Trace registers
// Op0 Op1 CRn CRm Op2
@@ -1321,6 +1322,12 @@ def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>;
def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>;
} // FeatureSEL2
+// v8.4a PMU registers
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeaturePMU} }] in {
+def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>;
+} // FeaturePMU
+
// v8.4a RAS registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::FeatureRASv8_4} }] in {
@@ -1452,14 +1459,37 @@ let Requires = [{ {AArch64::FeatureMTE} }] in {
def : RWSysReg<"TCO", 0b11, 0b011, 0b0100, 0b0010, 0b111>;
def : RWSysReg<"GCR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b110>;
def : RWSysReg<"RGSR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b101>;
-def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0110, 0b0101, 0b000>;
-def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0110, 0b0101, 0b000>;
-def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0110, 0b0110, 0b000>;
-def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0110, 0b0110, 0b000>;
-def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0110, 0b0110, 0b001>;
+def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b000>;
+def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0101, 0b0110, 0b000>;
+def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0101, 0b0110, 0b000>;
+def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0101, 0b0110, 0b000>;
+def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b001>;
def : ROSysReg<"GMID_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b100>;
} // HasMTE
+// Embedded Trace Extension R/W System registers
+let Requires = [{ {AArch64::FeatureETE} }] in {
+// Name Op0 Op1 CRn CRm Op2
+def : RWSysReg<"TRCRSR", 0b10, 0b001, 0b0000, 0b1010, 0b000>;
+// TRCEXTINSELR0 has the same encoding as ETM TRCEXTINSELR
+def : RWSysReg<"TRCEXTINSELR0", 0b10, 0b001, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"TRCEXTINSELR1", 0b10, 0b001, 0b0000, 0b1001, 0b100>;
+def : RWSysReg<"TRCEXTINSELR2", 0b10, 0b001, 0b0000, 0b1010, 0b100>;
+def : RWSysReg<"TRCEXTINSELR3", 0b10, 0b001, 0b0000, 0b1011, 0b100>;
+} // FeatureETE
+
+// Trace Buffer Extension System registers
+let Requires = [{ {AArch64::FeatureTRBE} }] in {
+// Name Op0 Op1 CRn CRm Op2
+def : RWSysReg<"TRBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b000>;
+def : RWSysReg<"TRBPTR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b001>;
+def : RWSysReg<"TRBBASER_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b010>;
+def : RWSysReg<"TRBSR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b011>;
+def : RWSysReg<"TRBMAR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b100>;
+def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>;
+def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>;
+} // FeatureTRBE
+
// Cyclone specific system registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::ProcCyclone} }] in
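A hedged cross-check of the corrected MTE rows above (illustration only; the packing helper is assumed, not taken from LLVM). With CRn=0b0101 and CRm=0b0110, TFSR_EL1 corresponds to S3_0_C5_C6_0, and packing the five fields as op0[15:14] op1[13:11] CRn[10:7] CRm[6:3] op2[2:0] gives its 16-bit operand encoding:

  #include <cstdint>

  constexpr uint16_t sysregEncoding(unsigned Op0, unsigned Op1, unsigned CRn,
                                    unsigned CRm, unsigned Op2) {
    return static_cast<uint16_t>((Op0 << 14) | (Op1 << 11) | (CRn << 7) |
                                 (CRm << 3) | Op2);
  }

  // TFSR_EL1 == S3_0_C5_C6_0 under the corrected definition above.
  static_assert(sysregEncoding(0b11, 0b000, 0b0101, 0b0110, 0b000) == 0xC2B0,
                "TFSR_EL1 packs to the expected 16-bit encoding");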
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 865461480499..b3ed96e815be 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -157,6 +157,8 @@ extern "C" void LLVMInitializeAArch64Target() {
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
RegisterTargetMachine<AArch64beTargetMachine> Y(getTheAArch64beTarget());
RegisterTargetMachine<AArch64leTargetMachine> Z(getTheARM64Target());
+ RegisterTargetMachine<AArch64leTargetMachine> W(getTheARM64_32Target());
+ RegisterTargetMachine<AArch64leTargetMachine> V(getTheAArch64_32Target());
auto PR = PassRegistry::getPassRegistry();
initializeGlobalISel(*PR);
initializeAArch64A53Fix835769Pass(*PR);
@@ -180,6 +182,7 @@ extern "C" void LLVMInitializeAArch64Target() {
initializeLDTLSCleanupPass(*PR);
initializeAArch64SpeculationHardeningPass(*PR);
initializeAArch64StackTaggingPass(*PR);
+ initializeAArch64StackTaggingPreRAPass(*PR);
}
//===----------------------------------------------------------------------===//
@@ -187,11 +190,11 @@ extern "C" void LLVMInitializeAArch64Target() {
//===----------------------------------------------------------------------===//
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO())
- return llvm::make_unique<AArch64_MachoTargetObjectFile>();
+ return std::make_unique<AArch64_MachoTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
- return llvm::make_unique<AArch64_COFFTargetObjectFile>();
+ return std::make_unique<AArch64_COFFTargetObjectFile>();
- return llvm::make_unique<AArch64_ELFTargetObjectFile>();
+ return std::make_unique<AArch64_ELFTargetObjectFile>();
}
// Helper function to build a DataLayout string
@@ -200,8 +203,11 @@ static std::string computeDataLayout(const Triple &TT,
bool LittleEndian) {
if (Options.getABIName() == "ilp32")
return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128";
- if (TT.isOSBinFormatMachO())
+ if (TT.isOSBinFormatMachO()) {
+ if (TT.getArch() == Triple::aarch64_32)
+ return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128";
return "e-m:o-i64:64-i128:128-n32:64-S128";
+ }
if (TT.isOSBinFormatCOFF())
return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128";
if (LittleEndian)
@@ -277,8 +283,11 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
this->Options.TrapUnreachable = true;
}
- // Enable GlobalISel at or below EnableGlobalISelAt0.
- if (getOptLevel() <= EnableGlobalISelAtO) {
+ // Enable GlobalISel at or below EnableGlobalISelAtO, unless this is
+ // arm64_32 or MachO/CodeModel::Large, neither of which GlobalISel supports.
+ if (getOptLevel() <= EnableGlobalISelAtO &&
+ TT.getArch() != Triple::aarch64_32 &&
+ !(getCodeModel() == CodeModel::Large && TT.isOSBinFormatMachO())) {
setGlobalISel(true);
setGlobalISelAbort(GlobalISelAbortMode::Disable);
}
@@ -310,7 +319,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
+ I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
isLittle);
}
return I.get();
@@ -448,7 +457,8 @@ void AArch64PassConfig::addIRPasses() {
addPass(createLICMPass());
}
- addPass(createAArch64StackTaggingPass());
+ addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() !=
+ CodeGenOpt::None));
}
// Pass Pipeline Configuration
@@ -502,7 +512,8 @@ bool AArch64PassConfig::addIRTranslator() {
}
void AArch64PassConfig::addPreLegalizeMachineIR() {
- addPass(createAArch64PreLegalizeCombiner());
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAArch64PreLegalizeCombiner(IsOptNone));
}
bool AArch64PassConfig::addLegalizeMachineIR() {
@@ -516,9 +527,7 @@ bool AArch64PassConfig::addRegBankSelect() {
}
void AArch64PassConfig::addPreGlobalInstructionSelect() {
- // Workaround the deficiency of the fast register allocator.
- if (TM->getOptLevel() == CodeGenOpt::None)
- addPass(new Localizer());
+ addPass(new Localizer());
}
bool AArch64PassConfig::addGlobalInstructionSelect() {
@@ -540,6 +549,8 @@ bool AArch64PassConfig::addILPOpts() {
if (EnableStPairSuppress)
addPass(createAArch64StorePairSuppressPass());
addPass(createAArch64SIMDInstrOptPass());
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createAArch64StackTaggingPreRAPass());
return true;
}
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 1c3d5d0743ad..54562094fcf5 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -59,8 +59,8 @@ MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol(
}
const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
- const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
- MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV,
+ int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
assert((Offset+MV.getConstant() == 0) &&
"Arch64 does not support GOT PC rel with extra offset");
// On ARM64 Darwin, we can reference symbols with foo@GOT-., which
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index 7ead363d42fe..1cb4c028c80d 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -35,7 +35,8 @@ public:
const TargetMachine &TM,
MachineModuleInfo *MMI) const override;
- const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+ const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV,
+ const MCSymbol *Sym,
const MCValue &MV, int64_t Offset,
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a4b78f2a7d6b..dc916a7b3407 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -618,6 +618,19 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
+AArch64TTIImpl::TTI::MemCmpExpansionOptions
+AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+ Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ Options.NumLoadsPerBlock = Options.MaxNumLoads;
+ // TODO: Though vector loads usually perform well on AArch64, in some targets
+ // they may wake up the FP unit, which raises the power consumption. Perhaps
+ // they could be used with no holds barred (-O3).
+ Options.LoadSizes = {8, 4, 2, 1};
+ return Options;
+}
+
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
unsigned Alignment, unsigned AddressSpace,
const Instruction *I) {
@@ -879,22 +892,6 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
return Considerable;
}
-unsigned AArch64TTIImpl::getCacheLineSize() {
- return ST->getCacheLineSize();
-}
-
-unsigned AArch64TTIImpl::getPrefetchDistance() {
- return ST->getPrefetchDistance();
-}
-
-unsigned AArch64TTIImpl::getMinPrefetchStride() {
- return ST->getMinPrefetchStride();
-}
-
-unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
- return ST->getMaxPrefetchIterationsAhead();
-}
-
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
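To make the new enableMemCmpExpansion hook concrete, here is a hedged sketch (not LLVM code) of the kind of straight-line expansion it permits: with load sizes {8, 4, 2, 1} and overlapping loads allowed, an equality-only compare of 15 bytes needs just two 8-byte loads per operand, the second pair overlapping the first by one byte:

  #include <cstdint>
  #include <cstring>

  static uint64_t load64(const void *P) {
    uint64_t V;
    std::memcpy(&V, P, sizeof(V)); // byte-wise load, no alignment assumption
    return V;
  }

  // IsZeroCmp case: only equal / not-equal matters, not ordering.
  bool equal15(const void *A, const void *B) {
    const char *PA = static_cast<const char *>(A);
    const char *PB = static_cast<const char *>(B);
    return load64(PA) == load64(PB) && load64(PA + 7) == load64(PB + 7);
  }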
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 10c15a139b4c..32c59f41e1c3 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -85,7 +85,8 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
- unsigned getNumberOfRegisters(bool Vector) {
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector) {
if (ST->hasNEON())
return 32;
@@ -130,6 +131,9 @@ public:
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I = nullptr);
+ TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+ bool IsZeroCmp) const;
+
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);
@@ -153,14 +157,6 @@ public:
shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader);
- unsigned getCacheLineSize();
-
- unsigned getPrefetchDistance();
-
- unsigned getMinPrefetchStride();
-
- unsigned getMaxPrefetchIterationsAhead();
-
bool shouldExpandReduction(const IntrinsicInst *II) const {
return false;
}
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index f4c55d48d215..4fb409f020d9 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -935,48 +935,34 @@ public:
return false;
}
- bool isMovZSymbolG3() const {
- return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
+ bool isMovWSymbolG3() const {
+ return isMovWSymbol({AArch64MCExpr::VK_ABS_G3, AArch64MCExpr::VK_PREL_G3});
}
- bool isMovZSymbolG2() const {
- return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
- AArch64MCExpr::VK_TPREL_G2,
- AArch64MCExpr::VK_DTPREL_G2});
- }
-
- bool isMovZSymbolG1() const {
- return isMovWSymbol({
- AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S,
- AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1,
- AArch64MCExpr::VK_DTPREL_G1,
- });
- }
-
- bool isMovZSymbolG0() const {
- return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
- AArch64MCExpr::VK_TPREL_G0,
- AArch64MCExpr::VK_DTPREL_G0});
- }
-
- bool isMovKSymbolG3() const {
- return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
- }
-
- bool isMovKSymbolG2() const {
- return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC);
+ bool isMovWSymbolG2() const {
+ return isMovWSymbol(
+ {AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
+ AArch64MCExpr::VK_ABS_G2_NC, AArch64MCExpr::VK_PREL_G2,
+ AArch64MCExpr::VK_PREL_G2_NC, AArch64MCExpr::VK_TPREL_G2,
+ AArch64MCExpr::VK_DTPREL_G2});
}
- bool isMovKSymbolG1() const {
- return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC,
- AArch64MCExpr::VK_TPREL_G1_NC,
- AArch64MCExpr::VK_DTPREL_G1_NC});
+ bool isMovWSymbolG1() const {
+ return isMovWSymbol(
+ {AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S,
+ AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_PREL_G1,
+ AArch64MCExpr::VK_PREL_G1_NC, AArch64MCExpr::VK_GOTTPREL_G1,
+ AArch64MCExpr::VK_TPREL_G1, AArch64MCExpr::VK_TPREL_G1_NC,
+ AArch64MCExpr::VK_DTPREL_G1, AArch64MCExpr::VK_DTPREL_G1_NC});
}
- bool isMovKSymbolG0() const {
+ bool isMovWSymbolG0() const {
return isMovWSymbol(
- {AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC,
- AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC});
+ {AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
+ AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_PREL_G0,
+ AArch64MCExpr::VK_PREL_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC,
+ AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_TPREL_G0_NC,
+ AArch64MCExpr::VK_DTPREL_G0, AArch64MCExpr::VK_DTPREL_G0_NC});
}
template<int RegWidth, int Shift>
@@ -1814,7 +1800,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Token, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Token, Ctx);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->Tok.IsSuffix = IsSuffix;
@@ -1829,7 +1815,7 @@ public:
AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
unsigned ShiftAmount = 0,
unsigned HasExplicitAmount = false) {
- auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Register, Ctx);
Op->Reg.RegNum = RegNum;
Op->Reg.Kind = Kind;
Op->Reg.ElementWidth = 0;
@@ -1861,7 +1847,7 @@ public:
CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_VectorList, Ctx);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.NumElements = NumElements;
@@ -1874,7 +1860,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_VectorIndex, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_VectorIndex, Ctx);
Op->VectorIndex.Val = Idx;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1883,7 +1869,7 @@ public:
static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S,
SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Immediate, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Immediate, Ctx);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1894,7 +1880,7 @@ public:
unsigned ShiftAmount,
SMLoc S, SMLoc E,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
Op->ShiftedImm.Val = Val;
Op->ShiftedImm.ShiftAmount = ShiftAmount;
Op->StartLoc = S;
@@ -1904,7 +1890,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_CondCode, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_CondCode, Ctx);
Op->CondCode.Code = Code;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1913,7 +1899,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateFPImm(APFloat Val, bool IsExact, SMLoc S, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_FPImm, Ctx);
Op->FPImm.Val = Val.bitcastToAPInt().getSExtValue();
Op->FPImm.IsExact = IsExact;
Op->StartLoc = S;
@@ -1925,7 +1911,7 @@ public:
StringRef Str,
SMLoc S,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Barrier, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Barrier, Ctx);
Op->Barrier.Val = Val;
Op->Barrier.Data = Str.data();
Op->Barrier.Length = Str.size();
@@ -1939,7 +1925,7 @@ public:
uint32_t MSRReg,
uint32_t PStateField,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_SysReg, Ctx);
Op->SysReg.Data = Str.data();
Op->SysReg.Length = Str.size();
Op->SysReg.MRSReg = MRSReg;
@@ -1952,7 +1938,7 @@ public:
static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S,
SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_SysCR, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_SysCR, Ctx);
Op->SysCRImm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1963,7 +1949,7 @@ public:
StringRef Str,
SMLoc S,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_Prefetch, Ctx);
Op->Prefetch.Val = Val;
Op->Barrier.Data = Str.data();
Op->Barrier.Length = Str.size();
@@ -1976,7 +1962,7 @@ public:
StringRef Str,
SMLoc S,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_PSBHint, Ctx);
Op->PSBHint.Val = Val;
Op->PSBHint.Data = Str.data();
Op->PSBHint.Length = Str.size();
@@ -1989,7 +1975,7 @@ public:
StringRef Str,
SMLoc S,
MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_BTIHint, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_BTIHint, Ctx);
Op->BTIHint.Val = Val << 1 | 32;
Op->BTIHint.Data = Str.data();
Op->BTIHint.Length = Str.size();
@@ -2001,7 +1987,7 @@ public:
static std::unique_ptr<AArch64Operand>
CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
+ auto Op = std::make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
Op->ShiftExtend.Type = ShOp;
Op->ShiftExtend.Amount = Val;
Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
@@ -2840,7 +2826,7 @@ static const struct Extension {
{"sve2-aes", {AArch64::FeatureSVE2AES}},
{"sve2-sm4", {AArch64::FeatureSVE2SM4}},
{"sve2-sha3", {AArch64::FeatureSVE2SHA3}},
- {"bitperm", {AArch64::FeatureSVE2BitPerm}},
+ {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}},
// FIXME: Unsupported extensions
{"pan", {}},
{"lor", {}},
@@ -3260,6 +3246,13 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
.Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
.Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
.Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
+ .Case("prel_g3", AArch64MCExpr::VK_PREL_G3)
+ .Case("prel_g2", AArch64MCExpr::VK_PREL_G2)
+ .Case("prel_g2_nc", AArch64MCExpr::VK_PREL_G2_NC)
+ .Case("prel_g1", AArch64MCExpr::VK_PREL_G1)
+ .Case("prel_g1_nc", AArch64MCExpr::VK_PREL_G1_NC)
+ .Case("prel_g0", AArch64MCExpr::VK_PREL_G0)
+ .Case("prel_g0_nc", AArch64MCExpr::VK_PREL_G0_NC)
.Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
.Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
.Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
@@ -5283,7 +5276,7 @@ bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
auto parseOp = [&]() -> bool {
SMLoc L = getLoc();
- const MCExpr *Expr;
+ const MCExpr *Expr = nullptr;
if (check(getParser().parseExpression(Expr), L, "expected expression"))
return true;
const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
@@ -5542,43 +5535,43 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
switch (Kind) {
default:
return Match_InvalidOperand;
- case MCK__35_0:
+ case MCK__HASH_0:
ExpectedVal = 0;
break;
- case MCK__35_1:
+ case MCK__HASH_1:
ExpectedVal = 1;
break;
- case MCK__35_12:
+ case MCK__HASH_12:
ExpectedVal = 12;
break;
- case MCK__35_16:
+ case MCK__HASH_16:
ExpectedVal = 16;
break;
- case MCK__35_2:
+ case MCK__HASH_2:
ExpectedVal = 2;
break;
- case MCK__35_24:
+ case MCK__HASH_24:
ExpectedVal = 24;
break;
- case MCK__35_3:
+ case MCK__HASH_3:
ExpectedVal = 3;
break;
- case MCK__35_32:
+ case MCK__HASH_32:
ExpectedVal = 32;
break;
- case MCK__35_4:
+ case MCK__HASH_4:
ExpectedVal = 4;
break;
- case MCK__35_48:
+ case MCK__HASH_48:
ExpectedVal = 48;
break;
- case MCK__35_6:
+ case MCK__HASH_6:
ExpectedVal = 6;
break;
- case MCK__35_64:
+ case MCK__HASH_64:
ExpectedVal = 64;
break;
- case MCK__35_8:
+ case MCK__HASH_8:
ExpectedVal = 8;
break;
}
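For readers puzzled by the rename above: the old MCK__35_<n> names appear to come from encoding the literal '#' token by its ASCII code (35); the generated matcher now spells it MCK__HASH_<n>, which reads much better. A trivial, hedged check of the only factual claim here:

  #include <cassert>

  int main() {
    assert('#' == 35); // ASCII code behind the old MCK__35_<n> spelling
    return 0;
  }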
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 6418211a4f55..21ce5785ea5e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -153,9 +153,8 @@ static unsigned AdrImmBits(unsigned Value) {
static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
uint64_t Value, MCContext &Ctx,
const Triple &TheTriple, bool IsResolved) {
- unsigned Kind = Fixup.getKind();
int64_t SignedValue = static_cast<int64_t>(Value);
- switch (Kind) {
+ switch (Fixup.getTargetKind()) {
default:
llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
@@ -574,7 +573,7 @@ public:
case MCCFIInstruction::OpDefCfa: {
// Defines a frame pointer.
unsigned XReg =
- getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true));
+ getXRegFromWReg(*MRI.getLLVMRegNum(Inst.getRegister(), true));
// Other CFA registers than FP are not supported by compact unwind.
// Fallback on DWARF.
@@ -593,8 +592,8 @@ public:
assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
"Frame pointer not pushed!");
- unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
- unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
+ unsigned LRReg = *MRI.getLLVMRegNum(LRPush.getRegister(), true);
+ unsigned FPReg = *MRI.getLLVMRegNum(FPPush.getRegister(), true);
LRReg = getXRegFromWReg(LRReg);
FPReg = getXRegFromWReg(FPReg);
@@ -615,14 +614,14 @@ public:
case MCCFIInstruction::OpOffset: {
// Registers are saved in pairs. We expect there to be two consecutive
// `.cfi_offset' instructions with the appropriate registers specified.
- unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ unsigned Reg1 = *MRI.getLLVMRegNum(Inst.getRegister(), true);
if (i + 1 == e)
return CU::UNWIND_ARM64_MODE_DWARF;
const MCCFIInstruction &Inst2 = Instrs[++i];
if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
return CU::UNWIND_ARM64_MODE_DWARF;
- unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
+ unsigned Reg2 = *MRI.getLLVMRegNum(Inst2.getRegister(), true);
// N.B. The encodings must be in register number order, and the X
// registers before the D registers.
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index c871e2c62eac..0fd1ca187be7 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -57,7 +57,7 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32)
static bool isNonILP32reloc(const MCFixup &Fixup,
AArch64MCExpr::VariantKind RefKind,
MCContext &Ctx) {
- if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw)
+ if (Fixup.getTargetKind() != AArch64::fixup_aarch64_movw)
return false;
switch (RefKind) {
case AArch64MCExpr::VK_ABS_G3:
@@ -120,7 +120,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
"Should only be expression-level modifiers here");
if (IsPCRel) {
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
return ELF::R_AARCH64_NONE;
@@ -184,7 +184,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
} else {
if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
return ELF::R_AARCH64_NONE;
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
case FK_NONE:
return ELF::R_AARCH64_NONE;
case FK_Data_1:
@@ -394,6 +394,20 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(MOVW_SABS_G0);
if (RefKind == AArch64MCExpr::VK_ABS_G0_NC)
return R_CLS(MOVW_UABS_G0_NC);
+ if (RefKind == AArch64MCExpr::VK_PREL_G3)
+ return ELF::R_AARCH64_MOVW_PREL_G3;
+ if (RefKind == AArch64MCExpr::VK_PREL_G2)
+ return ELF::R_AARCH64_MOVW_PREL_G2;
+ if (RefKind == AArch64MCExpr::VK_PREL_G2_NC)
+ return ELF::R_AARCH64_MOVW_PREL_G2_NC;
+ if (RefKind == AArch64MCExpr::VK_PREL_G1)
+ return R_CLS(MOVW_PREL_G1);
+ if (RefKind == AArch64MCExpr::VK_PREL_G1_NC)
+ return ELF::R_AARCH64_MOVW_PREL_G1_NC;
+ if (RefKind == AArch64MCExpr::VK_PREL_G0)
+ return R_CLS(MOVW_PREL_G0);
+ if (RefKind == AArch64MCExpr::VK_PREL_G0_NC)
+ return R_CLS(MOVW_PREL_G0_NC);
if (RefKind == AArch64MCExpr::VK_DTPREL_G2)
return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
if (RefKind == AArch64MCExpr::VK_DTPREL_G1)
@@ -434,5 +448,5 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) {
- return llvm::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32);
+ return std::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index d0a544273b8b..1a16468484ad 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -172,7 +172,8 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
int ImmS = MI->getOperand(4).getImm();
if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) &&
- (ImmR == 0 || ImmS < ImmR)) {
+ (ImmR == 0 || ImmS < ImmR) &&
+ STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
// BFC takes precedence over its entire range, slightly differently to BFI.
int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
int LSB = (BitWidth - ImmR) % BitWidth;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index ecff1ab0a8b3..5926a4f81616 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -30,7 +30,7 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
-AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
+AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
// We prefer NEON instructions to be printed in the short, Apple-specific
// form when targeting Darwin.
AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant;
@@ -39,7 +39,8 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
PrivateLabelPrefix = "L";
SeparatorString = "%%";
CommentString = ";";
- CodePointerSize = CalleeSaveStackSlotSize = 8;
+ CalleeSaveStackSlotSize = 8;
+ CodePointerSize = IsILP32 ? 4 : 8;
AlignmentIsInBytes = false;
UsesELFSectionDirectiveForBSS = true;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 36ae92afc8c1..7274ae79f74a 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -23,7 +23,7 @@ class Target;
class Triple;
struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
- explicit AArch64MCAsmInfoDarwin();
+ explicit AArch64MCAsmInfoDarwin(bool IsILP32);
const MCExpr *
getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
MCStreamer &Streamer) const override;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 0a529321edc8..548e399e05a3 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -42,6 +42,13 @@ StringRef AArch64MCExpr::getVariantKindName() const {
case VK_ABS_G0: return ":abs_g0:";
case VK_ABS_G0_S: return ":abs_g0_s:";
case VK_ABS_G0_NC: return ":abs_g0_nc:";
+ case VK_PREL_G3: return ":prel_g3:";
+ case VK_PREL_G2: return ":prel_g2:";
+ case VK_PREL_G2_NC: return ":prel_g2_nc:";
+ case VK_PREL_G1: return ":prel_g1:";
+ case VK_PREL_G1_NC: return ":prel_g1_nc:";
+ case VK_PREL_G0: return ":prel_g0:";
+ case VK_PREL_G0_NC: return ":prel_g0_nc:";
case VK_DTPREL_G2: return ":dtprel_g2:";
case VK_DTPREL_G1: return ":dtprel_g1:";
case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:";
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index ec9c95911628..a82ff2e91426 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -27,12 +27,13 @@ public:
// symbol. E.g. direct, via the GOT, ...
VK_ABS = 0x001,
VK_SABS = 0x002,
- VK_GOT = 0x003,
- VK_DTPREL = 0x004,
- VK_GOTTPREL = 0x005,
- VK_TPREL = 0x006,
- VK_TLSDESC = 0x007,
- VK_SECREL = 0x008,
+ VK_PREL = 0x003,
+ VK_GOT = 0x004,
+ VK_DTPREL = 0x005,
+ VK_GOTTPREL = 0x006,
+ VK_TPREL = 0x007,
+ VK_TLSDESC = 0x008,
+ VK_SECREL = 0x009,
VK_SymLocBits = 0x00f,
// Variants specifying which part of the final address calculation is
@@ -72,6 +73,13 @@ public:
VK_ABS_G0_S = VK_SABS | VK_G0,
VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC,
VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC,
+ VK_PREL_G3 = VK_PREL | VK_G3,
+ VK_PREL_G2 = VK_PREL | VK_G2,
+ VK_PREL_G2_NC = VK_PREL | VK_G2 | VK_NC,
+ VK_PREL_G1 = VK_PREL | VK_G1,
+ VK_PREL_G1_NC = VK_PREL | VK_G1 | VK_NC,
+ VK_PREL_G0 = VK_PREL | VK_G0,
+ VK_PREL_G0_NC = VK_PREL | VK_G0 | VK_NC,
VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC,
VK_GOT_PAGE = VK_GOT | VK_PAGE,
VK_DTPREL_G2 = VK_DTPREL | VK_G2,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index df12274d9470..1d583ec0087b 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -241,7 +241,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TheTriple) {
MCAsmInfo *MAI;
if (TheTriple.isOSBinFormatMachO())
- MAI = new AArch64MCAsmInfoDarwin();
+ MAI = new AArch64MCAsmInfoDarwin(TheTriple.getArch() == Triple::aarch64_32);
else if (TheTriple.isWindowsMSVCEnvironment())
MAI = new AArch64MCAsmInfoMicrosoftCOFF();
else if (TheTriple.isOSBinFormatCOFF())
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index b3ce5ef22eef..fc04d37eb362 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -54,7 +54,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
Log2Size = ~0U;
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
return false;
@@ -406,6 +406,6 @@ void AArch64MachObjectWriter::recordRelocation(
std::unique_ptr<MCObjectTargetWriter>
llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype,
bool IsILP32) {
- return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype,
+ return std::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype,
IsILP32);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index a45880a07427..aa50bd05cb71 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -120,7 +120,7 @@ bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
namespace llvm {
std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter() {
- return llvm::make_unique<AArch64WinCOFFObjectWriter>();
+ return std::make_unique<AArch64WinCOFFObjectWriter>();
}
} // end namespace llvm
diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td
index 808e59467081..8ccf6aa675ba 100644
--- a/lib/Target/AArch64/SVEInstrFormats.td
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@@ -279,6 +279,19 @@ let Predicates = [HasSVE] in {
defm PTRUES : sve_int_ptrue<0b001, "ptrues">;
}
+//===----------------------------------------------------------------------===//
+// SVE pattern match helpers.
+//===----------------------------------------------------------------------===//
+
+class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ Instruction inst>
+: Pat<(vtd (op vt1:$Op1)),
+ (inst $Op1)>;
+
+class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
+ (inst $Op1, $Op2, $Op3)>;
//===----------------------------------------------------------------------===//
// SVE Predicate Misc Group
@@ -403,12 +416,12 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm> {
}
class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
- ZPRRegOp zprty>
-: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg),
- asm, "\t$Zdn, $Pg",
+ ZPRRegOp zprty, PPRRegOp pprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm),
+ asm, "\t$Zdn, $Pm",
"",
[]>, Sched<[]> {
- bits<4> Pg;
+ bits<4> Pm;
bits<5> Zdn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
@@ -416,7 +429,7 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b10000;
let Inst{10-9} = opc{1-0};
- let Inst{8-5} = Pg;
+ let Inst{8-5} = Pm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
@@ -425,9 +438,16 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
}
multiclass sve_int_count_v<bits<5> opc, string asm> {
- def _H : sve_int_count_v<0b01, opc, asm, ZPR16>;
- def _S : sve_int_count_v<0b10, opc, asm, ZPR32>;
- def _D : sve_int_count_v<0b11, opc, asm, ZPR64>;
+ def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>;
+ def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>;
+ def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>;
+
+ def : InstAlias<asm # "\t$Zdn, $Pm",
+ (!cast<Instruction>(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>;
+ def : InstAlias<asm # "\t$Zdn, $Pm",
+ (!cast<Instruction>(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>;
+ def : InstAlias<asm # "\t$Zdn, $Pm",
+ (!cast<Instruction>(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>;
}
class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm,
@@ -609,11 +629,12 @@ multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm> {
//===----------------------------------------------------------------------===//
class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
- RegisterClass srcRegType>
+ ValueType vt, RegisterClass srcRegType,
+ SDPatternOperator op>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn),
asm, "\t$Zd, $Rn",
"",
- []>, Sched<[]> {
+ [(set (vt zprty:$Zd), (op srcRegType:$Rn))]>, Sched<[]> {
bits<5> Rn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
@@ -623,11 +644,11 @@ class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_perm_dup_r<string asm> {
- def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, GPR32sp>;
- def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>;
- def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>;
- def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>;
+multiclass sve_int_perm_dup_r<string asm, SDPatternOperator op> {
+ def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, nxv16i8, GPR32sp, op>;
+ def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, nxv8i16, GPR32sp, op>;
+ def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, nxv4i32, GPR32sp, op>;
+ def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, nxv2i64, GPR64sp, op>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>;
@@ -744,7 +765,7 @@ multiclass sve2_int_perm_tbl<string asm> {
}
class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
@@ -758,6 +779,8 @@ class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
let Inst{15-10} = 0b001011;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
}
multiclass sve2_int_perm_tbx<string asm> {
@@ -826,10 +849,14 @@ class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve_int_perm_unpk<bits<2> opc, string asm> {
+multiclass sve_int_perm_unpk<bits<2> opc, string asm, SDPatternOperator op> {
def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>;
def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>;
def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>;
+
+ def : SVE_1_Op_Pat<nxv8i16, op, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Pat<nxv4i32, op, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Pat<nxv2i64, op, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -1197,10 +1224,12 @@ multiclass sve_fp_ftmad<string asm> {
//===----------------------------------------------------------------------===//
class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
- ZPRRegOp zprty>
+ ZPRRegOp zprty,
+ ValueType vt, ValueType vt2, SDPatternOperator op>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
- "", []>, Sched<[]> {
+ "",
+ [(set (vt zprty:$Zd), (op (vt zprty:$Zn), (vt2 zprty:$Zm)))]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
@@ -1214,10 +1243,10 @@ class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve_fp_3op_u_zd<bits<3> opc, string asm> {
- def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
- def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
- def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
+ def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16, nxv8f16, nxv8f16, op>;
+ def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32, nxv4f32, nxv4f32, op>;
+ def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64, nxv2f64, nxv2f64, op>;
}
//===----------------------------------------------------------------------===//
@@ -1489,7 +1518,7 @@ multiclass sve_fp_fcadd<string asm> {
class sve2_fp_convert_precision<bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
-: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn),
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
@@ -1504,6 +1533,8 @@ class sve2_fp_convert_precision<bits<4> opc, string asm,
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
}
multiclass sve2_fp_convert_down_narrow<string asm> {
@@ -1998,12 +2029,14 @@ class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
- let ElementSize = zprty1.ElementSize;
}
-multiclass sve_intx_dot<bit opc, string asm> {
+multiclass sve_intx_dot<bit opc, string asm, SDPatternOperator op> {
def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>;
def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2028,22 +2061,27 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
- let ElementSize = ElementSizeNone;
}
-multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> {
- def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
+multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,
+ SDPatternOperator op> {
+ def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
+ def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> {
bits<1> iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b:$idx))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>;
+ def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b:$idx))),
+ (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>;
}
//===----------------------------------------------------------------------===//
@@ -2399,21 +2437,40 @@ multiclass sve2_misc_bitwise<bits<4> opc, string asm> {
def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
}
-multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
- let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in {
- def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8, ZPR8>;
- def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>;
- def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>;
- def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>;
- }
-}
-
multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> {
def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
}
+class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-11} = 0b10010;
+ let Inst{10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
+ def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>;
+ def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
+ def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
+ def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
+}
+
class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
Operand immtype>
@@ -2451,9 +2508,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
// SVE2 Accumulate Group
//===----------------------------------------------------------------------===//
-class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
- ZPRRegOp zprty, Operand immtype>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
+class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
@@ -2468,38 +2525,40 @@ class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
}
-multiclass sve2_int_bin_cons_shift_imm_left<bit opc, string asm> {
- def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
- def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
+ def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
- def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
-multiclass sve2_int_bin_cons_shift_imm_right<bit opc, string asm> {
- def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
- def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
+ def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
- def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
-class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
- ZPRRegOp zprty, Operand immtype>
+class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm),
asm, "\t$Zda, $Zn, $imm",
"", []>, Sched<[]> {
@@ -2521,15 +2580,15 @@ class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_bin_accum_cons_shift_imm_right<bits<2> opc, string asm> {
- def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
- def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
+ def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
- def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
@@ -2607,9 +2666,9 @@ multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> {
// SVE2 Narrowing Group
//===----------------------------------------------------------------------===//
-class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
- string asm, ZPRRegOp zprty1,
- ZPRRegOp zprty2, Operand immtype>
+class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc,
+ string asm, ZPRRegOp zprty1,
+ ZPRRegOp zprty2, Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
@@ -2622,26 +2681,63 @@ class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-14} = 0b00;
- let Inst{13-10} = opc;
+ let Inst{13-11} = opc;
+ let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
-multiclass sve2_int_bin_cons_shift_imm_right_narrow<bits<4> opc, string asm> {
- def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16,
- vecshiftR8>;
- def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32,
- vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm> {
+ def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16,
+ vecshiftR8>;
+ def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32,
+ vecshiftR16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, opc, asm, ZPR32, ZPR64,
- vecshiftR32> {
+ def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
+ vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
}
-class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
- ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
+ string asm, ZPRRegOp zprty1,
+ ZPRRegOp zprty2, Operand immtype>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm),
+ asm, "\t$Zd, $Zn, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> imm;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = tsz8_64{2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-14} = 0b00;
+ let Inst{13-11} = opc;
+ let Inst{10} = 0b1;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm> {
+ def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16,
+ vecshiftR8>;
+ def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32,
+ vecshiftR16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
+ vecshiftR32> {
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
+class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
@@ -2652,19 +2748,46 @@ class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
- let Inst{12-10} = opc; // S, R, T
+ let Inst{12-11} = opc; // S, R
+ let Inst{10} = 0b0; // Top
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
-multiclass sve2_int_addsub_narrow_high<bits<3> opc, string asm> {
- def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>;
- def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>;
- def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_addsub_narrow_high_bottom<bits<2> opc, string asm> {
+ def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>;
+}
+
+class sve2_int_addsub_narrow_high_top<bits<2> sz, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b011;
+ let Inst{12-11} = opc; // S, R
+ let Inst{10} = 0b1; // Top
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_addsub_narrow_high_top<bits<2> opc, string asm> {
+ def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>;
}
-class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
- ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_sat_extract_narrow_bottom<bits<3> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
bits<5> Zd;
@@ -2674,15 +2797,41 @@ class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-13} = 0b000010;
- let Inst{12-10} = opc;
+ let Inst{12-11} = opc;
+ let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
-multiclass sve2_int_sat_extract_narrow<bits<3> opc, string asm> {
- def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>;
- def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>;
- def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_sat_extract_narrow_bottom<bits<2> opc, string asm> {
+ def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>;
+}
+
+class sve2_int_sat_extract_narrow_top<bits<3> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn),
+ asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = tsz8_64{2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-13} = 0b000010;
+ let Inst{12-11} = opc;
+ let Inst{10} = 0b1;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_sat_extract_narrow_top<bits<2> opc, string asm> {
+ def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>;
}
//===----------------------------------------------------------------------===//
@@ -2713,11 +2862,17 @@ class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm> {
+multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm> {
@@ -2735,11 +2890,21 @@ multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm> {
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
}
-multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm> {
+multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm> {
@@ -3886,9 +4051,9 @@ multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
-class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
- RegisterOperand VecList>
-: I<(outs VecList:$Zt), iops,
+class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
+ RegisterOperand listty, ZPRRegOp zprty>
+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
asm, "\t$Zt, $Pg, [$Zn, $Rm]",
"",
[]>, Sched<[]> {
@@ -3908,17 +4073,14 @@ class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
let mayStore = 1;
}
-multiclass sve2_mem_cstnt_vs<bits<3> opc, string asm,
+multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_cstnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
- asm, listty>;
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
- def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
}
@@ -4147,6 +4309,14 @@ class sve_int_perm_punpk<bit opc, string asm>
let Inst{3-0} = Pd;
}
+multiclass sve_int_perm_punpk<bit opc, string asm, SDPatternOperator op> {
+ def NAME : sve_int_perm_punpk<opc, asm>;
+
+ def : SVE_1_Op_Pat<nxv8i1, op, nxv16i1, !cast<Instruction>(NAME)>;
+ def : SVE_1_Op_Pat<nxv4i1, op, nxv8i1, !cast<Instruction>(NAME)>;
+ def : SVE_1_Op_Pat<nxv2i1, op, nxv4i1, !cast<Instruction>(NAME)>;
+}
+
class sve_int_rdffr_pred<bit s, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg),
asm, "\t$Pd, $Pg/z",
@@ -5094,7 +5264,7 @@ multiclass sve_mem_p_fill<string asm> {
(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
}
-class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
+class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), iops,
asm, "\t$Zt, $Pg/z, [$Zn, $Rm]",
@@ -5119,17 +5289,15 @@ class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
let mayLoad = 1;
}
-multiclass sve2_mem_cldnt_vs<bits<5> opc, string asm,
+multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_cldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
- def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 7bb075c36e79..c27fc7a112ec 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -125,7 +125,7 @@ namespace llvm {
uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) {
// Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name
- Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
+ static const Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
std::string UpperName = Name.upper();
SmallVector<StringRef, 5> Ops;
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e5e2fc2cb0df..7a4fcac09ec4 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -313,9 +313,9 @@ struct SysAlias {
uint16_t Encoding;
FeatureBitset FeaturesRequired;
- SysAlias (const char *N, uint16_t E) : Name(N), Encoding(E) {};
- SysAlias (const char *N, uint16_t E, FeatureBitset F) :
- Name(N), Encoding(E), FeaturesRequired(F) {};
+ constexpr SysAlias(const char *N, uint16_t E) : Name(N), Encoding(E) {}
+ constexpr SysAlias(const char *N, uint16_t E, FeatureBitset F)
+ : Name(N), Encoding(E), FeaturesRequired(F) {}
bool haveFeatures(FeatureBitset ActiveFeatures) const {
return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
@@ -326,9 +326,10 @@ struct SysAlias {
struct SysAliasReg : SysAlias {
bool NeedsReg;
- SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {};
- SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F) : SysAlias(N, E, F),
- NeedsReg(R) {};
+ constexpr SysAliasReg(const char *N, uint16_t E, bool R)
+ : SysAlias(N, E), NeedsReg(R) {}
+ constexpr SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F)
+ : SysAlias(N, E, F), NeedsReg(R) {}
};
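For context, the feature gate in haveFeatures above is a plain subset test. A toy, self-contained version using std::bitset in place of llvm::FeatureBitset (an assumption made only to keep the sketch standalone, not part of the patch) looks like this:

#include <bitset>

using Features = std::bitset<8>; // stand-in for llvm::FeatureBitset

// An alias is usable only if every feature it requires is currently active.
bool haveFeatures(const Features &Required, const Features &Active) {
  return (Required & Active) == Required;
}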
namespace AArch64AT{
@@ -627,6 +628,18 @@ namespace AArch64II {
/// MO_S - Indicates that the bits of the symbol operand represented by
/// MO_G0 etc are signed.
MO_S = 0x100,
+
+ /// MO_PREL - Indicates that the bits of the symbol operand represented by
+ /// MO_G0 etc are PC relative.
+ MO_PREL = 0x200,
+
+ /// MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag
+ /// in bits 56-63.
+ /// On a FrameIndex operand, indicates that the underlying memory is tagged
+ /// with an unknown tag value (MTE); this needs to be lowered either to an
+ /// SP-relative load or store instruction (which do not check tags), or to
+ /// an LDG instruction to obtain the tag value.
+ MO_TAGGED = 0x400,
};
} // end namespace AArch64II
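A minimal sketch of how these operand-flag bits compose, using a standalone model of the enum above (the model namespace and helper names are hypothetical, not part of the patch):

#include <cstdint>

namespace model {
// Values mirror the AArch64II flags shown above.
constexpr unsigned MO_S = 0x100, MO_PREL = 0x200, MO_TAGGED = 0x400;
constexpr bool isPCRel(unsigned Flags) { return (Flags & MO_PREL) != 0; }
constexpr bool isTagged(unsigned Flags) { return (Flags & MO_TAGGED) != 0; }
} // namespace model

// A signed, PC-relative MO_G* fragment would carry both MO_S and MO_PREL.
static_assert(model::isPCRel(model::MO_S | model::MO_PREL), "PC-relative");
static_assert(!model::isTagged(model::MO_S | model::MO_PREL), "no tag bit set");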
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 19a8bd901629..b64422ae5427 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -188,6 +188,10 @@ ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
ModulePass *createR600OpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
+ModulePass *createAMDGPUPrintfRuntimeBinding();
+void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&);
+extern char &AMDGPUPrintfRuntimeBindingID;
+
ModulePass* createAMDGPUUnifyMetadataPass();
void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
extern char &AMDGPUUnifyMetadataID;
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index baeba534012c..42b477e07b3b 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -10,6 +10,15 @@ include "llvm/TableGen/SearchableTable.td"
include "llvm/Target/Target.td"
include "AMDGPUFeatures.td"
+def p0 : PtrValueType<i64, 0>;
+def p1 : PtrValueType<i64, 1>;
+def p2 : PtrValueType<i32, 2>;
+def p3 : PtrValueType<i32, 3>;
+def p4 : PtrValueType<i64, 4>;
+def p5 : PtrValueType<i32, 5>;
+def p6 : PtrValueType<i32, 6>;
+
+
class BoolToList<bit Value> {
list<int> ret = !if(Value, [1]<int>, []<int>);
}
@@ -145,6 +154,12 @@ def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
"Some GFX10 bug with misaligned multi-dword LDS access in WGP mode"
>;
+def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug",
+ "HasMFMAInlineLiteralBug",
+ "true",
+ "MFMA cannot use inline literal as SrcC"
+>;
+
def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard",
"HasVcmpxPermlaneHazard",
"true",
@@ -802,6 +817,7 @@ def FeatureISAVersion9_0_8 : FeatureSet<
FeaturePkFmacF16Inst,
FeatureAtomicFaddInsts,
FeatureSRAMECC,
+ FeatureMFMAInlineLiteralBug,
FeatureCodeObjectV3]>;
def FeatureISAVersion9_0_9 : FeatureSet<
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 419ebb2240ad..e72b3f4fde63 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -173,6 +173,9 @@ static StringRef intrinsicToAttrName(Intrinsic::ID ID,
case Intrinsic::amdgcn_implicitarg_ptr:
return "amdgpu-implicitarg-ptr";
case Intrinsic::amdgcn_queue_ptr:
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private:
+ // TODO: Does not require queue ptr on gfx9+
case Intrinsic::trap:
case Intrinsic::debugtrap:
IsQueuePtr = true;
@@ -194,18 +197,12 @@ static bool handleAttr(Function &Parent, const Function &Callee,
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
bool &NeedQueuePtr) {
// X ids are unnecessarily propagated to kernels.
- static const StringRef AttrNames[] = {
- { "amdgpu-work-item-id-x" },
- { "amdgpu-work-item-id-y" },
- { "amdgpu-work-item-id-z" },
- { "amdgpu-work-group-id-x" },
- { "amdgpu-work-group-id-y" },
- { "amdgpu-work-group-id-z" },
- { "amdgpu-dispatch-ptr" },
- { "amdgpu-dispatch-id" },
- { "amdgpu-kernarg-segment-ptr" },
- { "amdgpu-implicitarg-ptr" }
- };
+ static constexpr StringLiteral AttrNames[] = {
+ "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
+ "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
+ "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
+ "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
+ "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 097730441ed8..f0e7ee910f95 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -48,8 +48,8 @@ public:
return ArgDescriptor(Reg, Mask, false, true);
}
- static ArgDescriptor createStack(Register Reg, unsigned Mask = ~0u) {
- return ArgDescriptor(Reg, Mask, true, true);
+ static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
+ return ArgDescriptor(Offset, Mask, true, true);
}
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 743ac64b8f10..f2d903c8e7b1 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -229,7 +229,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
// alignment.
Streamer.EmitValueToAlignment(64, 0, 1, 0);
if (ReadOnlySection.getAlignment() < 64)
- ReadOnlySection.setAlignment(64);
+ ReadOnlySection.setAlignment(Align(64));
const MCSubtargetInfo &STI = MF->getSubtarget();
@@ -273,7 +273,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
AsmPrinter::EmitFunctionEntryLabel();
}
-void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
@@ -342,6 +342,8 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
uint32_t NumVGPR,
+ Optional<uint32_t> NumAGPR,
+ uint32_t TotalNumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
uint64_t CodeSize,
@@ -349,6 +351,11 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
+ if (NumAGPR) {
+ OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
+ OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
+ false);
+ }
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
false);
@@ -417,7 +424,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
- MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);
+ MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
SetupMachineFunction(MF);
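The old setAlignment argument was a log2 value, so the previous 8 and 2 meant 2^8 = 256 and 2^2 = 4 bytes; llvm::Align carries the byte count directly. A small compile-time check of that equivalence (illustrative only, not part of the patch):

#include <cstdint>

constexpr uint64_t log2ToBytes(unsigned Log2A) { return uint64_t(1) << Log2A; }

static_assert(log2ToBytes(8) == 256, "entry functions: 256-byte alignment");
static_assert(log2ToBytes(2) == 4, "regular functions: 4-byte alignment");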
@@ -474,6 +481,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
emitCommonFunctionComments(
Info.NumVGPR,
+ STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
+ Info.getTotalNumVGPRs(STM),
Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
Info.PrivateSegmentSize,
getFunctionCodeSize(MF), MFI);
@@ -481,7 +490,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
OutStreamer->emitRawComment(" Kernel info:", false);
- emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+ emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR,
+ STM.hasMAIInsts()
+ ? CurrentProgramInfo.NumAccVGPR
+ : Optional<uint32_t>(),
+ CurrentProgramInfo.NumVGPR,
CurrentProgramInfo.NumSGPR,
CurrentProgramInfo.ScratchSize,
getFunctionCodeSize(MF), MFI);
@@ -507,6 +520,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
OutStreamer->emitRawComment(
+ " Occupancy: " +
+ Twine(CurrentProgramInfo.Occupancy), false);
+
+ OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
OutStreamer->emitRawComment(
@@ -588,6 +605,11 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
UsesVCC, UsesFlatScratch);
}
+int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
+ const GCNSubtarget &ST) const {
+ return std::max(NumVGPR, NumAGPR);
+}
+
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const MachineFunction &MF) const {
SIFunctionResourceInfo Info;
@@ -634,11 +656,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
HighestVGPRReg = Reg;
break;
}
- MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg);
- if (MRI.isPhysRegUsed(AReg)) {
- HighestVGPRReg = AReg;
- break;
+ }
+
+ if (ST.hasMAIInsts()) {
+ MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestAGPRReg = Reg;
+ break;
+ }
}
+ Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestAGPRReg) + 1;
}
MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
@@ -660,6 +689,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
}
int32_t MaxVGPR = -1;
+ int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
uint64_t CalleeFrameSize = 0;
@@ -669,11 +699,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
+ bool IsAGPR = false;
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
@@ -744,6 +775,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
@@ -755,6 +787,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
@@ -771,6 +804,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 4;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
@@ -790,6 +824,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
@@ -799,6 +834,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 32;
} else {
llvm_unreachable("Unknown register class");
@@ -807,6 +843,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else if (IsAGPR) {
+ MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
@@ -828,6 +866,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
+ MaxAGPR = std::max(MaxAGPR, 23);
CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
Info.UsesVCC = true;
@@ -852,6 +891,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+ MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
CalleeFrameSize
= std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
Info.UsesVCC |= I->second.UsesVCC;
@@ -868,6 +908,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
+ Info.NumAGPR = MaxAGPR + 1;
Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
@@ -876,8 +917,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- ProgInfo.NumVGPR = Info.NumVGPR;
+ ProgInfo.NumArchVGPR = Info.NumVGPR;
+ ProgInfo.NumAccVGPR = Info.NumAGPR;
+ ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
ProgInfo.NumSGPR = Info.NumExplicitSGPR;
ProgInfo.ScratchSize = Info.PrivateSegmentSize;
ProgInfo.VCCUsed = Info.UsesVCC;
@@ -890,7 +934,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MF.getFunction().getContext().diagnose(DiagStackSize);
}
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
@@ -1057,6 +1100,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
+
+ ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize,
+ ProgInfo.NumSGPRsForWavesPerEU,
+ ProgInfo.NumVGPRsForWavesPerEU);
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
@@ -1214,17 +1261,16 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (STM.isXNACKEnabled())
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
- unsigned MaxKernArgAlign;
+ Align MaxKernArgAlign;
Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
- // These alignment values are specified in powers of two, so alignment =
- // 2^n. The minimum alignment is 2^4 = 16.
- Out.kernarg_segment_alignment = std::max<size_t>(4,
- countTrailingZeros(MaxKernArgAlign));
+ // kernarg_segment_alignment is specified as the log2 of the alignment.
+ // The minimum alignment is 16.
+ Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
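A worked model of the encoding above (standalone helpers with illustrative names, not part of the patch): the field stores log2 of the alignment, clamped to a minimum of 16 bytes.

#include <algorithm>
#include <cstdint>

constexpr unsigned log2u(uint64_t V) {
  unsigned L = 0;
  while (V >>= 1)
    ++L;
  return L;
}

// Assumes MaxKernArgAlignBytes is a power of two, as llvm::Align guarantees.
constexpr unsigned kernargSegmentAlignment(uint64_t MaxKernArgAlignBytes) {
  return log2u(std::max<uint64_t>(16, MaxKernArgAlignBytes));
}

static_assert(kernargSegmentAlignment(8) == 4, "clamped up to 16 = 2^4");
static_assert(kernargSegmentAlignment(64) == 6, "64 bytes -> 2^6");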
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index cf77034329ef..c50c19a4609c 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -43,6 +43,7 @@ private:
// Track the number of explicitly used VGPRs. Special registers reserved at
// the end are tracked separately.
int32_t NumVGPR = 0;
+ int32_t NumAGPR = 0;
int32_t NumExplicitSGPR = 0;
uint64_t PrivateSegmentSize = 0;
bool UsesVCC = false;
@@ -51,6 +52,7 @@ private:
bool HasRecursion = false;
int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
+ int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
};
SIProgramInfo CurrentProgramInfo;
@@ -77,6 +79,8 @@ private:
void EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
void emitCommonFunctionComments(uint32_t NumVGPR,
+ Optional<uint32_t> NumAGPR,
+ uint32_t TotalNumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
uint64_t CodeSize,
@@ -125,7 +129,7 @@ public:
void EmitFunctionEntryLabel() override;
- void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
+ void EmitBasicBlockStart(const MachineBasicBlock &MBB) override;
void EmitGlobalVariable(const GlobalVariable *GV) override;
@@ -140,8 +144,8 @@ public:
const char *ExtraCode, raw_ostream &O) override;
protected:
- mutable std::vector<std::string> DisasmLines, HexLines;
- mutable size_t DisasmLineMaxLen;
+ std::vector<std::string> DisasmLines, HexLines;
+ size_t DisasmLineMaxLen;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 8a92e7d923fb..ba8343142c63 100644
--- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -24,20 +25,10 @@
#define DEBUG_TYPE "amdgpu-atomic-optimizer"
using namespace llvm;
+using namespace llvm::AMDGPU;
namespace {
-enum DPP_CTRL {
- DPP_ROW_SR1 = 0x111,
- DPP_ROW_SR2 = 0x112,
- DPP_ROW_SR3 = 0x113,
- DPP_ROW_SR4 = 0x114,
- DPP_ROW_SR8 = 0x118,
- DPP_WF_SR1 = 0x138,
- DPP_ROW_BCAST15 = 0x142,
- DPP_ROW_BCAST31 = 0x143
-};
-
struct ReplacementInfo {
Instruction *I;
AtomicRMWInst::BinOp Op;
@@ -52,9 +43,12 @@ private:
const LegacyDivergenceAnalysis *DA;
const DataLayout *DL;
DominatorTree *DT;
- bool HasDPP;
+ const GCNSubtarget *ST;
bool IsPixelShader;
+ Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const;
+ Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
bool ValDivergent) const;
@@ -93,8 +87,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
DT = DTW ? &DTW->getDomTree() : nullptr;
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- HasDPP = ST.hasDPP();
+ ST = &TM.getSubtarget<GCNSubtarget>(F);
IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
visit(F);
@@ -142,17 +135,18 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
// If the pointer operand is divergent, then each lane is doing an atomic
// operation on a different address, and we cannot optimize that.
- if (DA->isDivergent(I.getOperand(PtrIdx))) {
+ if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
return;
}
- const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+ const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+ if (ValDivergent &&
+ (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
return;
}
@@ -219,20 +213,21 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
const unsigned ValIdx = 0;
- const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+ const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+ if (ValDivergent &&
+ (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
return;
}
// If any of the other arguments to the intrinsic are divergent, we can't
// optimize the operation.
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
- if (DA->isDivergent(I.getOperand(Idx))) {
+ if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
return;
}
}
@@ -282,6 +277,111 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
return B.CreateSelect(Cond, LHS, RHS);
}
+// Use the builder to create an inclusive scan of V across the wavefront, with
+// all lanes active.
+Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+ Value *V, Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Function *PermLaneX16 =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+
+ for (unsigned Idx = 0; Idx < 4; Idx++) {
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+ }
+ if (ST->hasDPPBroadcasts()) {
+ // GFX9 has DPP row broadcast operations.
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
+ B.getInt32(0xf), B.getFalse()}));
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
+ B.getInt32(0xf), B.getFalse()}));
+ } else {
+ // On GFX10 all DPP operations are confined to a single row. To get cross-
+ // row operations we have to use permlane or readlane.
+
+ // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
+ // 48..63).
+ Value *const PermX =
+ B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
+ B.getFalse(), B.getFalse()});
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+ B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
+ if (!ST->isWave32()) {
+ // Combine lane 31 into lanes 32..63.
+ Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
+ B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
+ }
+ }
+ return V;
+}
+
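The four ROW_SHR steps above implement a Hillis-Steele scan within each 16-lane row. A scalar model of that phase, assuming an integer add scan (one of several operations the pass supports) and shown only as an illustration outside the patch:

#include <array>
#include <cstdint>

// After shifts of 1, 2, 4 and 8, lane L holds the sum of lanes 0..L of the row.
std::array<uint32_t, 16> rowInclusiveScan(std::array<uint32_t, 16> V) {
  for (unsigned Shift = 1; Shift <= 8; Shift <<= 1)
    for (int L = 15; L >= 0; --L)   // high to low so V[L - Shift] is pre-step
      if (L >= int(Shift))
        V[L] += V[L - Shift];
  return V;
}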
+// Use the builder to create a shift right of V across the wavefront, with all
+// lanes active, to turn an inclusive scan into an exclusive scan.
+Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
+ Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Function *WriteLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+
+ if (ST->hasDPPWavefrontShifts()) {
+ // GFX9 has DPP wavefront shift operations.
+ V = B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
+ B.getInt32(0xf), B.getFalse()});
+ } else {
+ // On GFX10 all DPP operations are confined to a single row. To get cross-
+ // row operations we have to use permlane or readlane.
+ Value *Old = V;
+ V = B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
+
+ // Copy the old lane 15 to the new lane 16.
+ V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+ B.getInt32(16), V});
+
+ if (!ST->isWave32()) {
+ // Copy the old lane 31 to the new lane 32.
+ V = B.CreateCall(
+ WriteLane,
+ {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
+
+ // Copy the old lane 47 to the new lane 48.
+ V = B.CreateCall(
+ WriteLane,
+ {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
+ }
+ }
+
+ return V;
+}
+
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
unsigned BitWidth) {
switch (Op) {
@@ -345,23 +445,29 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
+ Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
CallInst *const Ballot = B.CreateIntrinsic(
- Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()},
+ Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()},
{B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});
// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
- Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
- Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
- Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
- CallInst *const PartialMbcnt = B.CreateIntrinsic(
- Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
- Value *const Mbcnt =
- B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
- {ExtractHi, PartialMbcnt}),
- Ty, false);
+ Value *Mbcnt;
+ if (ST->isWave32()) {
+ Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {Ballot, B.getInt32(0)});
+ } else {
+ Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
+ Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
+ Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
+ Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {ExtractLo, B.getInt32(0)});
+ Mbcnt =
+ B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
+ }
+ Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
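As a sketch of what the ballot-plus-mbcnt sequence computes (standalone model, wave64 assumed, not part of the patch): a lane's Mbcnt is the number of active lanes strictly below it in the ballot mask.

#include <bitset>
#include <cstdint>

uint32_t activeLanesBelow(uint64_t BallotMask, unsigned Lane /* 0..63 */) {
  uint64_t Below = BallotMask & ((uint64_t(1) << Lane) - 1);
  return std::bitset<64>(Below).count();
}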
@@ -373,47 +479,25 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
if (ValDivergent) {
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
- CallInst *const SetInactive =
- B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-
- CallInst *const FirstDPP =
- B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
- {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
- B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
- ExclScan = FirstDPP;
-
- const unsigned Iters = 7;
- const unsigned DPPCtrl[Iters] = {
- DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
- DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
- const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
- const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};
-
- // This loop performs an exclusive scan across the wavefront, with all lanes
- // active (by using the WWM intrinsic).
- for (unsigned Idx = 0; Idx < Iters; Idx++) {
- Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
- CallInst *const DPP = B.CreateIntrinsic(
- Intrinsic::amdgcn_update_dpp, Ty,
- {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
- B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
-
- ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
- }
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);
+ const AtomicRMWInst::BinOp ScanOp =
+ Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ ExclScan = buildShiftRight(B, NewV, Identity);
// Read the value from the last lane, which has accumulated the values of
// each active lane in the wavefront. This will be our new value which we
// will provide to the atomic operation.
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
+ B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
CallInst *const ReadLaneLo = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
+ Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
CallInst *const ReadLaneHi = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
+ Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
Value *const Insert =
@@ -421,7 +505,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
- {NewV, B.getInt32(63)});
+ {NewV, LastLaneIdx});
} else {
llvm_unreachable("Unhandled atomic bit width");
}
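One step worth spelling out: when Op is Sub, the scan itself still uses Add, because memory is updated once with the wavefront total while each lane reconstructs the old value it would have observed from the broadcast pre-op value. A sketch of that arithmetic, under those assumptions:

#include <cstdint>

// For atomic sub, lane i observes: old_mem - (sum of the values of the active
// lanes below i), where the prefix sums are built with addition.
uint32_t laneResultForSub(uint32_t BroadcastOldMem, uint32_t ExclusivePrefixSum) {
  return BroadcastOldMem - ExclusivePrefixSum;
}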
@@ -493,77 +577,80 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// original instruction.
B.SetInsertPoint(&I);
- // Create a PHI node to get our new atomic result into the exit block.
- PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(UndefValue::get(Ty), EntryBB);
- PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
-
- // We need to broadcast the value who was the lowest active lane (the first
- // lane) to all other lanes in the wavefront. We use an intrinsic for this,
- // but have to handle 64-bit broadcasts with two calls to this intrinsic.
- Value *BroadcastI = nullptr;
-
- if (TyBitWidth == 64) {
- Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
- CallInst *const ReadFirstLaneLo =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
- CallInst *const ReadFirstLaneHi =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
- Value *const PartialInsert = B.CreateInsertElement(
- UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
- BroadcastI = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
-
- BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
- } else {
- llvm_unreachable("Unhandled atomic bit width");
- }
+ const bool NeedResult = !I.use_empty();
+ if (NeedResult) {
+ // Create a PHI node to get our new atomic result into the exit block.
+ PHINode *const PHI = B.CreatePHI(Ty, 2);
+ PHI->addIncoming(UndefValue::get(Ty), EntryBB);
+ PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
- // Now that we have the result of our single atomic operation, we need to
- // get our individual lane's slice into the result. We use the lane offset we
- // previously calculated combined with the atomic result value we got from the
- // first lane, to get our lane's index into the atomic result.
- Value *LaneOffset = nullptr;
- if (ValDivergent) {
- LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
- } else {
- switch (Op) {
- default:
- llvm_unreachable("Unhandled atomic op");
- case AtomicRMWInst::Add:
- case AtomicRMWInst::Sub:
- LaneOffset = B.CreateMul(V, Mbcnt);
- break;
- case AtomicRMWInst::And:
- case AtomicRMWInst::Or:
- case AtomicRMWInst::Max:
- case AtomicRMWInst::Min:
- case AtomicRMWInst::UMax:
- case AtomicRMWInst::UMin:
- LaneOffset = B.CreateSelect(Cond, Identity, V);
- break;
- case AtomicRMWInst::Xor:
- LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
- break;
+ // We need to broadcast the value of the lowest active lane (the first
+ // lane) to all other lanes in the wavefront. We use an intrinsic for this,
+ // but have to handle 64-bit broadcasts with two calls to this intrinsic.
+ Value *BroadcastI = nullptr;
+
+ if (TyBitWidth == 64) {
+ Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
+ Value *const ExtractHi =
+ B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
+ CallInst *const ReadFirstLaneLo =
+ B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
+ CallInst *const ReadFirstLaneHi =
+ B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+ Value *const PartialInsert = B.CreateInsertElement(
+ UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
+ Value *const Insert =
+ B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
+ BroadcastI = B.CreateBitCast(Insert, Ty);
+ } else if (TyBitWidth == 32) {
+
+ BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
+ } else {
+ llvm_unreachable("Unhandled atomic bit width");
}
- }
- Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
- if (IsPixelShader) {
- // Need a final PHI to reconverge to above the helper lane branch mask.
- B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+ // Now that we have the result of our single atomic operation, we need to
+ // get our individual lane's slice into the result. We use the lane offset
+ // we previously calculated combined with the atomic result value we got
+ // from the first lane, to get our lane's index into the atomic result.
+ Value *LaneOffset = nullptr;
+ if (ValDivergent) {
+ LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+ } else {
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ LaneOffset = B.CreateMul(V, Mbcnt);
+ break;
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ LaneOffset = B.CreateSelect(Cond, Identity, V);
+ break;
+ case AtomicRMWInst::Xor:
+ LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
+ break;
+ }
+ }
+ Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
- PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
- PHI->addIncoming(Result, I.getParent());
- I.replaceAllUsesWith(PHI);
- } else {
- // Replace the original atomic instruction with the new one.
- I.replaceAllUsesWith(Result);
+ if (IsPixelShader) {
+ // Need a final PHI to reconverge to above the helper lane branch mask.
+ B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+
+ PHINode *const PHI = B.CreatePHI(Ty, 2);
+ PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
+ PHI->addIncoming(Result, I.getParent());
+ I.replaceAllUsesWith(PHI);
+ } else {
+ // Replace the original atomic instruction with the new one.
+ I.replaceAllUsesWith(Result);
+ }
}
// And delete the original.
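For the uniform-value case, the per-lane offsets above follow from how each operator composes with itself. A small model (N is the lane's Mbcnt, V the uniform operand; illustrative only, not part of the patch):

#include <cstdint>

// add/sub: N earlier lanes each contributed V, so the offset is N * V.
uint32_t addSubOffset(uint32_t V, uint32_t N) { return V * N; }

// xor: applying V twice cancels, so only the parity of N matters.
uint32_t xorOffset(uint32_t V, uint32_t N) { return V * (N & 1); }

// and/or/min/max are idempotent: the first active lane keeps the original
// value (identity offset), and every later lane sees old OP V, which further
// applications of V do not change (hence the select on Cond above).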
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index b107c357196d..58c44acde1a7 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -30,13 +30,15 @@ using namespace llvm;
namespace {
-struct OutgoingArgHandler : public CallLowering::ValueHandler {
- OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+struct OutgoingValueHandler : public CallLowering::ValueHandler {
+ OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}
MachineInstrBuilder MIB;
+ bool isIncomingArgumentHandler() const override { return false; }
+
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
llvm_unreachable("not implemented");
@@ -49,15 +51,96 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
- MIB.addUse(PhysReg);
- MIRBuilder.buildCopy(PhysReg, ValVReg);
+ Register ExtReg;
+ if (VA.getLocVT().getSizeInBits() < 32) {
+ // 16-bit types are reported as legal for 32-bit registers. We need to
+ // extend and do a 32-bit copy to avoid the verifier complaining about it.
+ ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
+ } else
+ ExtReg = extendRegister(ValVReg, VA);
+
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
+ MIB.addUse(PhysReg, RegState::Implicit);
}
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info,
+ ISD::ArgFlagsTy Flags,
CCState &State) override {
- return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+ }
+};
+
+struct IncomingArgHandler : public CallLowering::ValueHandler {
+ uint64_t StackUsed = 0;
+
+ IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : ValueHandler(B, MRI, AssignFn) {}
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ int FI = MFI.CreateFixedObject(Size, Offset, true);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+ Register AddrReg = MRI.createGenericVirtualRegister(
+ LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32));
+ MIRBuilder.buildFrameIndex(AddrReg, FI);
+ StackUsed = std::max(StackUsed, Size + Offset);
+ return AddrReg;
+ }
+
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override {
+ markPhysRegUsed(PhysReg);
+
+ if (VA.getLocVT().getSizeInBits() < 32) {
+ // 16-bit types are reported as legal for 32-bit registers. We need to do
+ // a 32-bit copy, and truncate to avoid the verifier complaining about it.
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ return;
+ }
+
+ switch (VA.getLocInfo()) {
+ case CCValAssign::LocInfo::SExt:
+ case CCValAssign::LocInfo::ZExt:
+ case CCValAssign::LocInfo::AExt: {
+ auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ break;
+ }
+ default:
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ break;
+ }
+ }
+
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ // FIXME: Get alignment
+ auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1);
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+ }
+
+ /// How the physical register gets marked varies between formal
+ /// parameters (it's a basic-block live-in), and a call instruction
+ /// (it's an implicit-def of the call instruction).
+ virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+ // FIXME: What is the point of this being a callback?
+ bool isIncomingArgumentHandler() const override { return true; }
+};
+
+struct FormalArgHandler : public IncomingArgHandler {
+ FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : IncomingArgHandler(B, MRI, AssignFn) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
}
};
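The markPhysRegUsed split above only pays off once calls are supported: for a call's return values the physical register is an implicit def of the call instruction rather than a basic-block live-in. A hypothetical counterpart to FormalArgHandler, not added by this patch (the name and shape follow the analogous AArch64 handler, and it assumes the includes already present in this file):

  struct CallReturnHandler : public IncomingArgHandler {
    CallReturnHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                      MachineInstrBuilder MIB, CCAssignFn *AssignFn)
        : IncomingArgHandler(B, MRI, AssignFn), MIB(MIB) {}

    // Return values come back in physregs that the call instruction
    // implicitly defines, so record them on the call, not on the block.
    void markPhysRegUsed(unsigned PhysReg) override {
      MIB.addDef(PhysReg, RegState::Implicit);
    }

    MachineInstrBuilder MIB;
  };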
@@ -67,55 +150,198 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
: CallLowering(&TLI) {
}
-bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+void AMDGPUCallLowering::splitToValueTypes(
+ const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
+ SplitArgTy PerformArgSplit) const {
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ LLVMContext &Ctx = OrigArg.Ty->getContext();
+
+ if (OrigArg.Ty->isVoidTy())
+ return;
+
+ SmallVector<EVT, 4> SplitVTs;
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);
+
+ assert(OrigArg.Regs.size() == SplitVTs.size());
+
+ int SplitIdx = 0;
+ for (EVT VT : SplitVTs) {
+ unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
+ Type *Ty = VT.getTypeForEVT(Ctx);
+
+ if (NumParts == 1) {
+ // No splitting to do, but we want to replace the original type (e.g. [1 x
+ // double] -> double).
+ SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty,
+ OrigArg.Flags, OrigArg.IsFixed);
+
+ ++SplitIdx;
+ continue;
+ }
+
+ LLT LLTy = getLLTForType(*Ty, DL);
+
+ SmallVector<Register, 8> SplitRegs;
+
+ EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
+ Type *PartTy = PartVT.getTypeForEVT(Ctx);
+ LLT PartLLT = getLLTForType(*PartTy, DL);
+
+ // FIXME: Should we be reporting all of the part registers for a single
+ // argument, and let handleAssignments take care of the repacking?
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
+ SplitRegs.push_back(PartReg);
+ SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
+ }
+
+ PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx);
+
+ ++SplitIdx;
+ }
+}
+
+// Get the appropriate type to make \p OrigTy \p Factor times bigger.
+static LLT getMultipleType(LLT OrigTy, int Factor) {
+ if (OrigTy.isVector()) {
+ return LLT::vector(OrigTy.getNumElements() * Factor,
+ OrigTy.getElementType());
+ }
+
+ return LLT::scalar(OrigTy.getSizeInBits() * Factor);
+}
+
+// TODO: Move to generic code
+static void unpackRegsToOrigType(MachineIRBuilder &B,
+ ArrayRef<Register> DstRegs,
+ Register SrcReg,
+ LLT SrcTy,
+ LLT PartTy) {
+ assert(DstRegs.size() > 1 && "Nothing to unpack");
+
+ MachineFunction &MF = B.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+ const unsigned PartSize = PartTy.getSizeInBits();
+
+ if (SrcTy.isVector() && !PartTy.isVector() &&
+ PartSize > SrcTy.getElementType().getSizeInBits()) {
+ // Vector was scalarized, and the elements extended.
+ auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(),
+ SrcReg);
+ for (int i = 0, e = DstRegs.size(); i != e; ++i)
+ B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
+ return;
+ }
+
+ if (SrcSize % PartSize == 0) {
+ B.buildUnmerge(DstRegs, SrcReg);
+ return;
+ }
+
+ const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;
+
+ LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
+ auto ImpDef = B.buildUndef(BigTy);
+
+ Register BigReg = MRI.createGenericVirtualRegister(BigTy);
+ B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0);
+
+ int64_t Offset = 0;
+ for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
+ B.buildExtract(DstRegs[i], BigReg, Offset);
+}
+
+/// Lower the return value for the already existing \p Ret. This assumes that
+/// \p B's insertion point is correct.
+bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
+ const Value *Val, ArrayRef<Register> VRegs,
+ MachineInstrBuilder &Ret) const {
+ if (!Val)
+ return true;
+
+ auto &MF = B.getMF();
+ const auto &F = MF.getFunction();
+ const DataLayout &DL = MF.getDataLayout();
+
+ CallingConv::ID CC = F.getCallingConv();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ ArgInfo OrigRetInfo(VRegs, Val->getType());
+ setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
+ SmallVector<ArgInfo, 4> SplitRetInfos;
+
+ splitToValueTypes(
+ OrigRetInfo, SplitRetInfos, DL, MRI, CC,
+ [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+ unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
+ });
+
+ CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
+
+ OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn);
+ return handleAssignments(B, SplitRetInfos, RetHandler);
+}
+
+bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
const Value *Val,
ArrayRef<Register> VRegs) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MFI->setIfReturnsVoid(!Val);
- if (!Val) {
- MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+ assert(!Val == VRegs.empty() && "Return value without a vreg");
+
+ CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
+ const bool IsShader = AMDGPU::isShader(CC);
+ const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
+ AMDGPU::isKernel(CC);
+ if (IsWaveEnd) {
+ B.buildInstr(AMDGPU::S_ENDPGM)
+ .addImm(0);
return true;
}
- Register VReg = VRegs[0];
-
- const Function &F = MF.getFunction();
- auto &DL = F.getParent()->getDataLayout();
- if (!AMDGPU::isShader(F.getCallingConv()))
- return false;
+ auto const &ST = B.getMF().getSubtarget<GCNSubtarget>();
+ unsigned ReturnOpc =
+ IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
- const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
- SmallVector<EVT, 4> SplitVTs;
- SmallVector<uint64_t, 4> Offsets;
- ArgInfo OrigArg{VReg, Val->getType()};
- setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
- ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
-
- SmallVector<ArgInfo, 8> SplitArgs;
- CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false);
- for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
- Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext());
- SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed});
+ auto Ret = B.buildInstrNoInsert(ReturnOpc);
+ Register ReturnAddrVReg;
+ if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
+ ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
+ Ret.addUse(ReturnAddrVReg);
}
- auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG);
- OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+
+ if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;
- MIRBuilder.insertInstr(RetInstr);
+ if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
+ &AMDGPU::SGPR_64RegClass);
+ B.buildCopy(ReturnAddrVReg, LiveInReturn);
+ }
+
+ // TODO: Handle CalleeSavedRegsViaCopy.
+
+ B.insertInstr(Ret);
return true;
}
-Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
+Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
Type *ParamTy,
uint64_t Offset) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
@@ -128,79 +354,37 @@ Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ B.buildConstant(OffsetReg, Offset);
- MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);
+ B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);
return DstReg;
}
-void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
+void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
Type *ParamTy, uint64_t Offset,
unsigned Align,
Register DstReg) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
- Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
+ Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
- MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
TypeSize, Align);
- MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
-}
-
-static Register findFirstFreeSGPR(CCState &CCInfo) {
- unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
- for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
- if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
- return AMDGPU::SGPR0 + Reg;
- }
- }
- llvm_unreachable("Cannot allocate sgpr");
-}
-
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
- const LLT S32 = LLT::scalar(32);
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- if (Info.hasWorkItemIDX()) {
- Register Reg = AMDGPU::VGPR0;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
-
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
- }
-
- if (Info.hasWorkItemIDY()) {
- Register Reg = AMDGPU::VGPR1;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
-
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
- }
-
- if (Info.hasWorkItemIDZ()) {
- Register Reg = AMDGPU::VGPR2;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
-
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
- }
+ B.buildLoad(DstReg, PtrReg, *MMO);
}
// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
- MachineIRBuilder &MIRBuilder,
+ MachineIRBuilder &B,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
@@ -229,8 +413,8 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Register VReg = MRI.createGenericVirtualRegister(P4);
MRI.addLiveIn(InputPtrReg, VReg);
- MIRBuilder.getMBB().addLiveIn(InputPtrReg);
- MIRBuilder.buildCopy(VReg, InputPtrReg);
+ B.getMBB().addLiveIn(InputPtrReg);
+ B.buildCopy(VReg, InputPtrReg);
CCInfo.AllocateReg(InputPtrReg);
}
@@ -250,74 +434,22 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}
-static void allocateSystemSGPRs(CCState &CCInfo,
- MachineFunction &MF,
- SIMachineFunctionInfo &Info,
- CallingConv::ID CallConv,
- bool IsShader) {
- const LLT S32 = LLT::scalar(32);
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- if (Info.hasWorkGroupIDX()) {
- Register Reg = Info.addWorkGroupIDX();
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupIDY()) {
- Register Reg = Info.addWorkGroupIDY();
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupIDZ()) {
- unsigned Reg = Info.addWorkGroupIDZ();
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupInfo()) {
- unsigned Reg = Info.addWorkGroupInfo();
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasPrivateSegmentWaveByteOffset()) {
- // Scratch wave offset passed in system SGPR.
- unsigned PrivateSegmentWaveByteOffsetReg;
-
- if (IsShader) {
- PrivateSegmentWaveByteOffsetReg =
- Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
-
- // This is true if the scratch wave byte offset doesn't have a fixed
- // location.
- if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
- PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
- Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
- }
- } else
- PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
-
- MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
- CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
- }
-}
-
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
- MachineIRBuilder &MIRBuilder, const Function &F,
+ MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+
const DataLayout &DL = F.getParent()->getDataLayout();
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
- allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);
+ allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
unsigned i = 0;
const unsigned KernArgBaseAlign = 16;
@@ -343,123 +475,242 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
: MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
- lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg);
+ lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg);
if (OrigArgRegs.size() > 1)
- unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder);
+ unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
++i;
}
- allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
- allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
+ TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
+ TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
return true;
}
+// TODO: Move this to generic code
+static void packSplitRegsToOrigType(MachineIRBuilder &B,
+ ArrayRef<Register> OrigRegs,
+ ArrayRef<Register> Regs,
+ LLT LLTy,
+ LLT PartLLT) {
+ if (!LLTy.isVector() && !PartLLT.isVector()) {
+ B.buildMerge(OrigRegs[0], Regs);
+ return;
+ }
+
+ if (LLTy.isVector() && PartLLT.isVector()) {
+ assert(LLTy.getElementType() == PartLLT.getElementType());
+
+ int DstElts = LLTy.getNumElements();
+ int PartElts = PartLLT.getNumElements();
+ if (DstElts % PartElts == 0)
+ B.buildConcatVectors(OrigRegs[0], Regs);
+ else {
+ // Deal with v3s16 split into v2s16
+ assert(PartElts == 2 && DstElts % 2 != 0);
+ int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);
+
+ LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
+ auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
+ B.buildExtract(OrigRegs[0], RoundedConcat, 0);
+ }
+
+ return;
+ }
+
+ assert(LLTy.isVector() && !PartLLT.isVector());
+
+ LLT DstEltTy = LLTy.getElementType();
+ if (DstEltTy == PartLLT) {
+ // Vector was trivially scalarized.
+ B.buildBuildVector(OrigRegs[0], Regs);
+ } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
+ // Deal with vector with 64-bit elements decomposed to 32-bit
+ // registers. Need to create intermediate 64-bit elements.
+ SmallVector<Register, 8> EltMerges;
+ int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
+
+ assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
+
+ for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
+ auto Merge = B.buildMerge(DstEltTy,
+ Regs.take_front(PartsPerElt));
+ EltMerges.push_back(Merge.getReg(0));
+ Regs = Regs.drop_front(PartsPerElt);
+ }
+
+ B.buildBuildVector(OrigRegs[0], EltMerges);
+ } else {
+ // Vector was split, and elements promoted to a wider type.
+ LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
+ auto BV = B.buildBuildVector(BVType, Regs);
+ B.buildTrunc(OrigRegs[0], BV);
+ }
+}
+
bool AMDGPUCallLowering::lowerFormalArguments(
- MachineIRBuilder &MIRBuilder, const Function &F,
+ MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const {
+ CallingConv::ID CC = F.getCallingConv();
+
// The infrastructure for normal calling convention lowering is essentially
// useless for kernels. We want to avoid any kind of legalization or argument
// splitting.
- if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
- return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs);
+ if (CC == CallingConv::AMDGPU_KERNEL)
+ return lowerFormalArgumentsKernel(B, F, VRegs);
- // AMDGPU_GS and AMDGP_HS are not supported yet.
- if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
- F.getCallingConv() == CallingConv::AMDGPU_HS)
- return false;
+ const bool IsShader = AMDGPU::isShader(CC);
+ const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
+ MachineBasicBlock &MBB = B.getMBB();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
- bool IsShader = AMDGPU::isShader(F.getCallingConv());
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+ CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
+
+ if (!IsEntryFunc) {
+ Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
+ Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
+ &AMDGPU::SGPR_64RegClass);
+ MBB.addLiveIn(ReturnAddrReg);
+ B.buildCopy(LiveInReturn, ReturnAddrReg);
+ }
if (Info->hasImplicitBufferPtr()) {
- unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
+ Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
- unsigned NumArgs = F.arg_size();
- Function::const_arg_iterator CurOrigArg = F.arg_begin();
- const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+
+ SmallVector<ArgInfo, 32> SplitArgs;
+ unsigned Idx = 0;
unsigned PSInputNum = 0;
- BitVector Skipped(NumArgs);
- for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
- EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());
-
- // We can only hanlde simple value types at the moment.
- ISD::ArgFlagsTy Flags;
- assert(VRegs[i].size() == 1 && "Can't lower into more than one register");
- ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()};
- setArgFlags(OrigArg, i + 1, DL, F);
- Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
-
- if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
- !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() &&
- PSInputNum <= 15) {
- if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) {
- Skipped.set(i);
- ++PSInputNum;
+
+ for (auto &Arg : F.args()) {
+ if (DL.getTypeStoreSize(Arg.getType()) == 0)
+ continue;
+
+ const bool InReg = Arg.hasAttribute(Attribute::InReg);
+
+ // SGPR arguments to functions not implemented.
+ if (!IsShader && InReg)
+ return false;
+
+ if (Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftError) ||
+ Arg.hasAttribute(Attribute::Nest))
+ return false;
+
+ if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
+ const bool ArgUsed = !Arg.use_empty();
+ bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
+
+ if (!SkipArg) {
+ Info->markPSInputAllocated(PSInputNum);
+ if (ArgUsed)
+ Info->markPSInputEnabled(PSInputNum);
+ }
+
+ ++PSInputNum;
+
+ if (SkipArg) {
+ for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
+ B.buildUndef(VRegs[Idx][I]);
+
+ ++Idx;
continue;
}
+ }
- Info->markPSInputAllocated(PSInputNum);
- if (!CurOrigArg->use_empty())
- Info->markPSInputEnabled(PSInputNum);
+ ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+ setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
- ++PSInputNum;
+ splitToValueTypes(
+ OrigArg, SplitArgs, DL, MRI, CC,
+ // FIXME: We should probably be passing multiple registers to
+ // handleAssignments to do this
+ [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+ packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
+ LLTy, PartLLT);
+ });
+
+ ++Idx;
+ }
+
+ // At least one interpolation mode must be enabled or else the GPU will
+ // hang.
+ //
+ // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
+ // set PSInputAddr, the user wants to enable some bits after the compilation
+ // based on run-time states. Since we can't know what the final PSInputEna
+ // will look like, we shouldn't do anything here and the user should take
+ // responsibility for the correct programming.
+ //
+ // Otherwise, the following restrictions apply:
+ // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+ // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+ // enabled too.
+ if (CC == CallingConv::AMDGPU_PS) {
+ if ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 &&
+ Info->isPSInputAllocated(11))) {
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->markPSInputEnabled(0);
}
- CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
- /*IsVarArg=*/false);
-
- if (ValEVT.isVector()) {
- EVT ElemVT = ValEVT.getVectorElementType();
- if (!ValEVT.isSimple())
- return false;
- MVT ValVT = ElemVT.getSimpleVT();
- bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full,
- OrigArg.Flags, CCInfo);
- if (!Res)
- return false;
- } else {
- MVT ValVT = ValEVT.getSimpleVT();
- if (!ValEVT.isSimple())
- return false;
- bool Res =
- AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
-
- // Fail if we don't know how to handle this type.
- if (Res)
- return false;
+ if (Subtarget.isAmdPalOS()) {
+ // For isAmdPalOS, the user does not enable some bits after compilation
+ // based on run-time states; the register values being generated here are
+ // the final ones set in hardware. Therefore we need to apply the
+ // workaround to PSInputAddr and PSInputEnable together. (The case where
+ // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
+ // set up an input arg for a particular interpolation mode, but nothing
+ // uses that input arg. Really we should have an earlier pass that removes
+ // such an arg.)
+ unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
+ if ((PsInputBits & 0x7F) == 0 ||
+ ((PsInputBits & 0xF) == 0 &&
+ (PsInputBits >> 11 & 1)))
+ Info->markPSInputEnabled(
+ countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
}
}
- Function::const_arg_iterator Arg = F.arg_begin();
-
- if (F.getCallingConv() == CallingConv::AMDGPU_VS ||
- F.getCallingConv() == CallingConv::AMDGPU_PS) {
- for (unsigned i = 0, OrigArgIdx = 0;
- OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
- if (Skipped.test(OrigArgIdx))
- continue;
- assert(VRegs[OrigArgIdx].size() == 1 &&
- "Can't lower into more than 1 reg");
- CCValAssign &VA = ArgLocs[i++];
- MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]);
- MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
- MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg());
- }
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
- allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader);
- return true;
+ if (!MBB.empty())
+ B.setInstr(*MBB.begin());
+
+ FormalArgHandler Handler(B, MRI, AssignFn);
+ if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
+ return false;
+
+ if (!IsEntryFunc) {
+ // Special inputs come after user arguments.
+ TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ }
+
+ // Start adding system SGPRs.
+ if (IsEntryFunc) {
+ TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
+ } else {
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
+ CCInfo.AllocateReg(Info->getFrameOffsetReg());
+ TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
- return false;
+ // Move back to the end of the basic block.
+ B.setMBB(MBB);
+
+ return true;
}
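The PSInputAddr/PSInputEnable restriction described above reduces to a small bit check: PERSP_* occupies bits 0-3 (mask 0xF), LINEAR_* bits 4-6 (mask 0x70), and POS_W_FLOAT is bit 11. A standalone sketch of the condition, mirroring the isAmdPalOS variant and not taken from the patch:

  #include <cstdint>

  // True when every interpolation mode would end up disabled, i.e. the case
  // the lowering works around by force-enabling PERSP_SAMPLE (input 0).
  bool needsDummyInterpInput(uint32_t PSInputBits) {
    const bool AnyPerspOrLinear = (PSInputBits & 0x7F) != 0; // PERSP_* | LINEAR_*
    const bool AnyPersp = (PSInputBits & 0xF) != 0;          // PERSP_* only
    const bool PosWFloat = (PSInputBits >> 11) & 1;          // POS_W_FLOAT
    return !AnyPerspOrLinear || (PosWFloat && !AnyPersp);
  }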
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 3599659cac6a..53a562586bc0 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -20,26 +20,37 @@
namespace llvm {
class AMDGPUTargetLowering;
+class MachineInstrBuilder;
class AMDGPUCallLowering: public CallLowering {
- Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+ Register lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy,
uint64_t Offset) const;
- void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
- uint64_t Offset, unsigned Align,
- Register DstReg) const;
+ void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
+ unsigned Align, Register DstReg) const;
- public:
+ /// A function of this type is used to perform value split action.
+ using SplitArgTy = std::function<void(ArrayRef<Register>, LLT, LLT, int)>;
+
+ void splitToValueTypes(const ArgInfo &OrigArgInfo,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI,
+ CallingConv::ID CallConv,
+ SplitArgTy SplitArg) const;
+
+ bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
+ ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
+
+public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
- bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ bool lowerReturn(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs) const override;
- bool lowerFormalArgumentsKernel(MachineIRBuilder &MIRBuilder,
- const Function &F,
+ bool lowerFormalArgumentsKernel(MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const;
- bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ bool lowerFormalArguments(MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 3688cd77542e..f8a54a61aac2 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -24,22 +24,9 @@ def CC_SI : CallingConv<[
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
- SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47,
- SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55,
- SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63,
- SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71,
- SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79,
- SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87,
- SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95,
- SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103,
- SGPR104, SGPR105
+ SGPR40, SGPR41, SGPR42, SGPR43
]>>>,
- // We have no way of referring to the generated register tuples
- // here, so use a custom function.
- CCIfInReg<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
- CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
-
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
@@ -69,15 +56,7 @@ def RetCC_SI_Shader : CallingConv<[
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
- SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47,
- SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55,
- SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63,
- SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71,
- SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79,
- SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87,
- SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95,
- SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103,
- SGPR104, SGPR105
+ SGPR40, SGPR41, SGPR42, SGPR43
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
@@ -138,7 +117,6 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
@@ -157,7 +135,6 @@ def RetCC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>
]>;
def CC_AMDGPU : CallingConv<[
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index b750c6b5f6d2..1640a4a59ee2 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -55,6 +55,12 @@ static cl::opt<bool> WidenLoads(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<bool> UseMul24Intrin(
+ "amdgpu-codegenprepare-mul24",
+ cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(true));
+
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const GCNSubtarget *ST = nullptr;
@@ -509,7 +515,9 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
}
}
- I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
+ Value *NewVal = insertValues(Builder, Ty, ResultVals);
+ NewVal->takeName(&I);
+ I.replaceAllUsesWith(NewVal);
I.eraseFromParent();
return true;
@@ -879,7 +887,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
DA->isUniform(&I) && promoteUniformOpToI32(I))
return true;
- if (replaceMulWithMul24(I))
+ if (UseMul24Intrin && replaceMulWithMul24(I))
return true;
bool Changed = false;
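Because UseMul24Intrin is a plain cl::opt, the new mul24 formation can be toggled when reproducing or bisecting codegen differences; an illustrative invocation (any AMDGPU triple works):

  llc -march=amdgcn -amdgpu-codegenprepare-mul24=0 input.ll -o -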
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index e80797736363..61ce83b30e00 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -13,9 +13,9 @@
#include "AMDGPUFrameLowering.h"
using namespace llvm;
-AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
- int LAO, unsigned TransAl)
- : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, Align StackAl,
+ int LAO, Align TransAl)
+ : TargetFrameLowering(D, StackAl, LAO, TransAl) {}
AMDGPUFrameLowering::~AMDGPUFrameLowering() = default;
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 48b64488303e..92e256cf2829 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -25,8 +25,8 @@ namespace llvm {
/// See TargetFrameInfo for more comments.
class AMDGPUFrameLowering : public TargetFrameLowering {
public:
- AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
- unsigned TransAl = 1);
+ AMDGPUFrameLowering(StackDirection D, Align StackAl, int LAO,
+ Align TransAl = Align::None());
~AMDGPUFrameLowering() override;
/// \returns The number of 32-bit sub-registers that are used when storing
diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td
index cad4c2ef404c..f2be1ca44d34 100644
--- a/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -12,10 +12,6 @@
include "AMDGPU.td"
-def p0 : PtrValueType<i64, 0>;
-def p1 : PtrValueType<i64, 1>;
-def p4 : PtrValueType<i64, 4>;
-
def sd_vsrc0 : ComplexPattern<i32, 1, "">;
def gi_vsrc0 :
GIComplexOperandMatcher<s32, "selectVSRC0">,
@@ -38,6 +34,18 @@ def gi_vop3omods :
GIComplexOperandMatcher<s32, "selectVOP3OMods">,
GIComplexPatternEquiv<VOP3OMods>;
+def gi_vop3omods0clamp0omod :
+ GIComplexOperandMatcher<s32, "selectVOP3Mods0Clamp0OMod">,
+ GIComplexPatternEquiv<VOP3Mods0Clamp0OMod>;
+
+def gi_vop3opselmods0 :
+ GIComplexOperandMatcher<s32, "selectVOP3OpSelMods0">,
+ GIComplexPatternEquiv<VOP3OpSelMods0>;
+
+def gi_vop3opselmods :
+ GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
+ GIComplexPatternEquiv<VOP3OpSelMods>;
+
def gi_smrd_imm :
GIComplexOperandMatcher<s64, "selectSmrdImm">,
GIComplexPatternEquiv<SMRDImm>;
@@ -50,12 +58,19 @@ def gi_smrd_sgpr :
GIComplexOperandMatcher<s64, "selectSmrdSgpr">,
GIComplexPatternEquiv<SMRDSgpr>;
+// FIXME: Why are the atomic versions separated?
def gi_flat_offset :
GIComplexOperandMatcher<s64, "selectFlatOffset">,
GIComplexPatternEquiv<FLATOffset>;
def gi_flat_offset_signed :
GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
GIComplexPatternEquiv<FLATOffsetSigned>;
+def gi_flat_atomic :
+ GIComplexOperandMatcher<s64, "selectFlatOffset">,
+ GIComplexPatternEquiv<FLATAtomic>;
+def gi_flat_signed_atomic :
+ GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
+ GIComplexPatternEquiv<FLATSignedAtomic>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
@@ -64,6 +79,44 @@ def gi_mubuf_scratch_offen :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">,
GIComplexPatternEquiv<MUBUFScratchOffen>;
+def gi_ds_1addr_1offset :
+ GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
+ GIComplexPatternEquiv<DS1Addr1Offset>;
+
+
+// Separate load nodes are defined to glue m0 initialization in
+// SelectionDAG. The GISel selector can just insert m0 initialization
+// directly before selecting a glue-less load, so hide this
+// distinction.
+
+def : GINodeEquiv<G_LOAD, AMDGPUld_glue> {
+ let CheckMMOIsNonAtomic = 1;
+}
+
+def : GINodeEquiv<G_STORE, AMDGPUst_glue> {
+ let CheckMMOIsNonAtomic = 1;
+}
+
+def : GINodeEquiv<G_LOAD, AMDGPUatomic_ld_glue> {
+ bit CheckMMOIsAtomic = 1;
+}
+
+
+
+def : GINodeEquiv<G_ATOMIC_CMPXCHG, atomic_cmp_swap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_XCHG, atomic_swap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_ADD, atomic_load_add_glue>;
+def : GINodeEquiv<G_ATOMICRMW_SUB, atomic_load_sub_glue>;
+def : GINodeEquiv<G_ATOMICRMW_AND, atomic_load_and_glue>;
+def : GINodeEquiv<G_ATOMICRMW_OR, atomic_load_or_glue>;
+def : GINodeEquiv<G_ATOMICRMW_XOR, atomic_load_xor_glue>;
+def : GINodeEquiv<G_ATOMICRMW_MIN, atomic_load_min_glue>;
+def : GINodeEquiv<G_ATOMICRMW_MAX, atomic_load_max_glue>;
+def : GINodeEquiv<G_ATOMICRMW_UMIN, atomic_load_umin_glue>;
+def : GINodeEquiv<G_ATOMICRMW_UMAX, atomic_load_umax_glue>;
+def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>;
+
+def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32>;
class GISelSop2Pat <
SDPatternOperator node,
@@ -143,20 +196,6 @@ multiclass GISelVop2IntrPat <
def : GISelSop2Pat <or, S_OR_B32, i32>;
def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
-// FIXME: We can't re-use SelectionDAG patterns here because they match
-// against a custom SDNode and we would need to create a generic machine
-// instruction that is equivalent to the custom SDNode. This would also require
-// us to custom legalize the intrinsic to the new generic machine instruction,
-// but I can't get custom legalizing of intrinsic to work and I'm not sure if
-// this is even supported yet.
-def : GISelVop3Pat2ModsPat <
- int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e64, v2f16, f32>;
-
-defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
-def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>;
-defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>;
-def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>;
-
// Since GlobalISel is more flexible then SelectionDAG, I think we can get
// away with adding patterns for integer types and not legalizing all
// loads and stores to vector types. This should help simplify the load/store
@@ -164,3 +203,6 @@ def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>;
foreach Ty = [i64, p0, p1, p4] in {
defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
}
+
+def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">,
+ GISDNodeXFormEquiv<as_i32timm>;
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 0a1f48231b18..85d1ad349157 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -22,15 +22,17 @@ enum PartialMappingIdx {
PM_SGPR128 = 9,
PM_SGPR256 = 10,
PM_SGPR512 = 11,
- PM_VGPR1 = 12,
- PM_VGPR16 = 16,
- PM_VGPR32 = 17,
- PM_VGPR64 = 18,
- PM_VGPR128 = 19,
- PM_VGPR256 = 20,
- PM_VGPR512 = 21,
- PM_SGPR96 = 22,
- PM_VGPR96 = 23
+ PM_SGPR1024 = 12,
+ PM_VGPR1 = 13,
+ PM_VGPR16 = 17,
+ PM_VGPR32 = 18,
+ PM_VGPR64 = 19,
+ PM_VGPR128 = 20,
+ PM_VGPR256 = 21,
+ PM_VGPR512 = 22,
+ PM_VGPR1024 = 23,
+ PM_SGPR96 = 24,
+ PM_VGPR96 = 25
};
const RegisterBankInfo::PartialMapping PartMappings[] {
@@ -45,6 +47,7 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
{0, 128, SGPRRegBank},
{0, 256, SGPRRegBank},
{0, 512, SGPRRegBank},
+ {0, 1024, SGPRRegBank},
{0, 1, VGPRRegBank}, // VGPR begin
{0, 16, VGPRRegBank},
@@ -53,8 +56,9 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
{0, 128, VGPRRegBank},
{0, 256, VGPRRegBank},
{0, 512, VGPRRegBank},
+ {0, 1024, VGPRRegBank},
{0, 96, SGPRRegBank},
- {0, 96, VGPRRegBank},
+ {0, 96, VGPRRegBank}
};
const RegisterBankInfo::ValueMapping ValMappings[] {
@@ -65,41 +69,43 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
{&PartMappings[1], 1},
// SGPRs
- {&PartMappings[2], 1},
+ {&PartMappings[2], 1}, // 1
{nullptr, 0}, // Illegal power of 2 sizes
{nullptr, 0},
{nullptr, 0},
- {&PartMappings[3], 1},
- {&PartMappings[4], 1},
- {&PartMappings[5], 1},
- {&PartMappings[6], 1},
- {&PartMappings[7], 1},
- {&PartMappings[8], 1},
-
- // VGPRs
- {&PartMappings[9], 1},
+ {&PartMappings[3], 1}, // 16
+ {&PartMappings[4], 1}, // 32
+ {&PartMappings[5], 1}, // 64
+ {&PartMappings[6], 1}, // 128
+ {&PartMappings[7], 1}, // 256
+ {&PartMappings[8], 1}, // 512
+ {&PartMappings[9], 1}, // 1024
+
+ // VGPRs
+ {&PartMappings[10], 1}, // 1
{nullptr, 0},
{nullptr, 0},
{nullptr, 0},
- {&PartMappings[10], 1},
- {&PartMappings[11], 1},
- {&PartMappings[12], 1},
- {&PartMappings[13], 1},
- {&PartMappings[14], 1},
- {&PartMappings[15], 1},
- {&PartMappings[16], 1},
- {&PartMappings[17], 1}
+ {&PartMappings[11], 1}, // 16
+ {&PartMappings[12], 1}, // 32
+ {&PartMappings[13], 1}, // 64
+ {&PartMappings[14], 1}, // 128
+ {&PartMappings[15], 1}, // 256
+ {&PartMappings[16], 1}, // 512
+ {&PartMappings[17], 1}, // 1024
+ {&PartMappings[18], 1},
+ {&PartMappings[19], 1}
};
const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
- /*32-bit op*/ {0, 32, SGPRRegBank},
- /*2x32-bit op*/ {0, 32, SGPRRegBank},
- {32, 32, SGPRRegBank},
-/*<2x32-bit> op*/ {0, 64, SGPRRegBank},
-
- /*32-bit op*/ {0, 32, VGPRRegBank},
- /*2x32-bit op*/ {0, 32, VGPRRegBank},
- {32, 32, VGPRRegBank},
+ {0, 32, SGPRRegBank}, // 32-bit op
+ {0, 32, SGPRRegBank}, // 2x32-bit op
+ {32, 32, SGPRRegBank},
+ {0, 64, SGPRRegBank}, // <2x32-bit> op
+
+ {0, 32, VGPRRegBank}, // 32-bit op
+ {0, 32, VGPRRegBank}, // 2x32-bit op
+ {32, 32, VGPRRegBank},
};
@@ -116,7 +122,7 @@ const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
enum ValueMappingIdx {
SCCStartIdx = 0,
SGPRStartIdx = 2,
- VGPRStartIdx = 12
+ VGPRStartIdx = 13
};
const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b31de0af5018..9f5bcd8ff5f0 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -218,12 +218,13 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL);
- unsigned MaxKernArgAlign;
+ Align MaxKernArgAlign;
HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
MaxKernArgAlign);
HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
- HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u);
+ HSACodeProps.mKernargSegmentAlign =
+ std::max(MaxKernArgAlign, Align(4)).value();
HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
@@ -883,7 +884,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
auto Kern = HSAMetadataDoc->getMapNode();
- unsigned MaxKernArgAlign;
+ Align MaxKernArgAlign;
Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode(
STM.getKernArgSegmentSize(F, MaxKernArgAlign));
Kern[".group_segment_fixed_size"] =
@@ -891,7 +892,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
Kern[".private_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
Kern[".kernarg_segment_align"] =
- Kern.getDocument()->getNode(std::max(uint32_t(4), MaxKernArgAlign));
+ Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
Kern[".wavefront_size"] =
Kern.getDocument()->getNode(STM.getWavefrontSize());
Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 2eecddbd7b01..80ac8ca67bcd 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -52,7 +52,7 @@ public:
class MetadataStreamerV3 final : public MetadataStreamer {
private:
std::unique_ptr<msgpack::Document> HSAMetadataDoc =
- llvm::make_unique<msgpack::Document>();
+ std::make_unique<msgpack::Document>();
void dump(StringRef HSAMetadataString) const;
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ea730539f834..f330bd7ebcdd 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -172,8 +172,9 @@ private:
MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
- SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+ SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
+ SDNode *glueCopyToM0LDSInit(SDNode *N) const;
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
@@ -186,10 +187,11 @@ private:
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC) const;
+ SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE, SDValue &DLC) const;
+ SDValue &SLC, SDValue &TFE, SDValue &DLC,
+ SDValue &SWZ) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
@@ -202,21 +204,20 @@ private:
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
SDValue &Offset, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC) const;
+ SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset, SDValue &SLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
+ template <bool IsSigned>
+ bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, SDValue &SLC) const;
bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
- template <bool IsSigned>
- bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &Offset, SDValue &SLC) const;
-
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
SDValue Expand32BitAddress(SDValue Addr) const;
@@ -262,6 +263,8 @@ private:
SDValue getHi16Elt(SDValue In) const;
+ SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
+
void SelectADD_SUB_I64(SDNode *N);
void SelectAddcSubb(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
@@ -282,6 +285,7 @@ private:
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
+ void SelectINTRINSIC_WO_CHAIN(SDNode *N);
void SelectINTRINSIC_VOID(SDNode *N);
protected:
@@ -543,7 +547,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
if (!N->isMachineOpcode()) {
if (N->getOpcode() == ISD::CopyToReg) {
unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
return MRI.getRegClass(Reg);
}
@@ -582,19 +586,10 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
}
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
- const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
-
- assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
-
- SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N),
- Val);
-
- SDValue Glue = M0.getValue(1);
-
+SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
+ SDValue Glue) const {
SmallVector <SDValue, 8> Ops;
- Ops.push_back(M0); // Replace the chain.
+ Ops.push_back(NewChain); // Replace the chain.
for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
Ops.push_back(N->getOperand(i));
@@ -602,6 +597,16 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
+
+ assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
+
+ SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
+ return glueCopyToOp(N, M0, M0.getValue(1));
+}
+
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
@@ -635,13 +640,13 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
switch (NumVectorElts) {
case 1:
- return AMDGPU::SReg_32_XM0RegClassID;
+ return AMDGPU::SReg_32RegClassID;
case 2:
return AMDGPU::SReg_64RegClassID;
case 3:
return AMDGPU::SGPR_96RegClassID;
case 4:
- return AMDGPU::SReg_128RegClassID;
+ return AMDGPU::SGPR_128RegClassID;
case 5:
return AMDGPU::SGPR_160RegClassID;
case 8:
@@ -713,12 +718,17 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
return; // Already selected.
}
- if (isa<AtomicSDNode>(N) ||
+ // isa<MemSDNode> almost works but is slightly too permissive for some DS
+ // intrinsics.
+ if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
Opc == ISD::ATOMIC_LOAD_FADD ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
N = glueCopyToM0LDSInit(N);
+ SelectCode(N);
+ return;
+ }
switch (Opc) {
default:
@@ -781,7 +791,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SDValue RC, SubReg0, SubReg1;
SDLoc DL(N);
if (N->getValueType(0) == MVT::i128) {
- RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
+ RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
} else if (N->getValueType(0) == MVT::i64) {
@@ -815,14 +825,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
return;
}
- case ISD::LOAD:
- case ISD::STORE:
- case ISD::ATOMIC_LOAD:
- case ISD::ATOMIC_STORE: {
- N = glueCopyToM0LDSInit(N);
- break;
- }
-
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
// There is a scalar version available, but unlike the vector version which
@@ -908,6 +910,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectINTRINSIC_W_CHAIN(N);
return;
}
+ case ISD::INTRINSIC_WO_CHAIN: {
+ SelectINTRINSIC_WO_CHAIN(N);
+ return;
+ }
case ISD::INTRINSIC_VOID: {
SelectINTRINSIC_VOID(N);
return;
@@ -961,6 +967,14 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
return true;
}
+SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
+ const SDLoc &DL) const {
+ SDNode *Mov = CurDAG->getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(Val, DL, MVT::i32));
+ return SDValue(Mov, 0);
+}
+
// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDLoc DL(N);
@@ -1308,7 +1322,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64,
SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC) const {
+ SDValue &TFE, SDValue &DLC,
+ SDValue &SWZ) const {
// Subtarget prefers to use flat instruction
if (Subtarget->useFlatForGlobal())
return false;
@@ -1321,6 +1336,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -1400,7 +1416,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &GLC,
SDValue &SLC, SDValue &TFE,
- SDValue &DLC) const {
+ SDValue &DLC, SDValue &SWZ) const {
SDValue Ptr, Offen, Idxen, Addr64;
// addr64 bit was removed for volcanic islands.
@@ -1408,7 +1424,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return false;
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC))
+ GLC, SLC, TFE, DLC, SWZ))
return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
@@ -1430,9 +1446,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &Offset,
SDValue &SLC) const {
SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
- SDValue GLC, TFE, DLC;
+ SDValue GLC, TFE, DLC, SWZ;
- return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC);
+ return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
@@ -1557,13 +1573,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &SOffset, SDValue &Offset,
SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC) const {
+ SDValue &TFE, SDValue &DLC,
+ SDValue &SWZ) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC))
+ GLC, SLC, TFE, DLC, SWZ))
return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
@@ -1585,16 +1602,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset
) const {
- SDValue GLC, SLC, TFE, DLC;
+ SDValue GLC, SLC, TFE, DLC, SWZ;
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset,
SDValue &SLC) const {
- SDValue GLC, TFE, DLC;
+ SDValue GLC, TFE, DLC, SWZ;
+
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
+}
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
+// Find a load or store from corresponding pattern root.
+// Roots may be build_vector, bitconvert or their combinations.
+static MemSDNode* findMemSDNode(SDNode *N) {
+ N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
+ if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
+ return MN;
+ assert(isa<BuildVectorSDNode>(N));
+ for (SDValue V : N->op_values())
+ if (MemSDNode *MN =
+ dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
+ return MN;
+ llvm_unreachable("cannot find MemSDNode in the pattern!");
}
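findMemSDNode assumes the pattern root is either the memory node itself or a build_vector whose operands wrap it in bitcasts; stripBitcast, an existing AMDGPUTargetLowering helper not shown in this diff, peels a single ISD::BITCAST. A sketch of the assumed shape of that helper, for reference only:

    #include "llvm/CodeGen/SelectionDAGNodes.h"

    // Assumed shape of AMDGPUTargetLowering::stripBitcast (not part of this
    // patch): strip one ISD::BITCAST wrapper, otherwise return the value as-is.
    static llvm::SDValue stripBitcastSketch(llvm::SDValue Val) {
      return Val.getOpcode() == llvm::ISD::BITCAST ? Val.getOperand(0) : Val;
    }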
template <bool IsSigned>
@@ -1603,8 +1634,95 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
SDValue &VAddr,
SDValue &Offset,
SDValue &SLC) const {
- return static_cast<const SITargetLowering*>(getTargetLowering())->
- SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC);
+ int64_t OffsetVal = 0;
+
+ if (Subtarget->hasFlatInstOffsets() &&
+ (!Subtarget->hasFlatSegmentOffsetBug() ||
+ findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
+ CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned AS = findMemSDNode(N)->getAddressSpace();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ } else {
+ // If the offset doesn't fit, put the low bits into the offset field and
+ // add the rest.
+
+ SDLoc DL(N);
+ uint64_t ImmField;
+ const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
+ if (IsSigned) {
+ ImmField = SignExtend64(COffsetVal, NumBits);
+
+ // Don't use a negative offset field if the base offset is positive.
+ // Since the scheduler currently relies on the offset field, doing so
+ // could result in strange scheduling decisions.
+
+ // TODO: Should we not do this in the opposite direction as well?
+ if (static_cast<int64_t>(COffsetVal) > 0) {
+ if (static_cast<int64_t>(ImmField) < 0) {
+ const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
+ ImmField = COffsetVal & OffsetMask;
+ }
+ }
+ } else {
+ // TODO: Should we do this for a negative offset?
+ const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
+ ImmField = COffsetVal & OffsetMask;
+ }
+
+ uint64_t RemainderOffset = COffsetVal - ImmField;
+
+ assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
+ assert(RemainderOffset + ImmField == COffsetVal);
+
+ OffsetVal = ImmField;
+
+ // TODO: Should this try to use a scalar add pseudo if the base address is
+ // uniform and saddr is usable?
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub1);
+
+ SDValue AddOffsetLo
+ = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue AddOffsetHi
+ = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+ SDNode *Add = CurDAG->getMachineNode(
+ AMDGPU::V_ADD_I32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
+ };
+
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs), 0);
+ }
+ }
+
+ VAddr = Addr;
+ Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
+ SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+ return true;
}
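The core of the new SelectFlatOffset is splitting an out-of-range constant into an immediate that fits the flat offset field plus a remainder that is added back into the base address with V_ADD_I32/V_ADDC_U32. A standalone sketch of just that arithmetic, using the same MathExtras.h helpers (the function name and the explicit NumBits/IsSigned parameters are illustrative; the real code queries SIInstrInfo for both):

    #include "llvm/Support/MathExtras.h"
    #include <cassert>
    #include <cstdint>
    #include <utility>

    // Split COffsetVal into (ImmField, Remainder) such that ImmField fits a
    // NumBits-wide (optionally signed) offset field and
    // ImmField + Remainder == COffsetVal.
    static std::pair<uint64_t, uint64_t>
    splitFlatOffset(uint64_t COffsetVal, unsigned NumBits, bool IsSigned) {
      uint64_t ImmField;
      if (IsSigned) {
        ImmField = llvm::SignExtend64(COffsetVal, NumBits);
        // As in the patch: avoid a negative offset field when the full offset
        // is positive, so the scheduler sees a sensible immediate.
        if (static_cast<int64_t>(COffsetVal) > 0 &&
            static_cast<int64_t>(ImmField) < 0)
          ImmField = COffsetVal & llvm::maskTrailingOnes<uint64_t>(NumBits - 1);
      } else {
        ImmField = COffsetVal & llvm::maskTrailingOnes<uint64_t>(NumBits);
      }
      uint64_t Remainder = COffsetVal - ImmField;
      assert(ImmField + Remainder == COffsetVal);
      return {ImmField, Remainder};
    }
    // e.g. splitFlatOffset(0x1234, /*NumBits=*/12, /*IsSigned=*/false)
    //      == {0x234, 0x1000}: 0x234 goes into the offset field, 0x1000 is
    //      added to the base address.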
bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
@@ -1616,10 +1734,10 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
}
bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset,
- SDValue &SLC) const {
+ SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset,
+ SDValue &SLC) const {
return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC);
}
@@ -2158,10 +2276,12 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
// offset field) % 64. Some versions of the programming guide omit the m0
// part, or claim it's from offset 0.
if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
- // If we have a constant offset, try to use the default value for m0 as a
- // base to possibly avoid setting it up.
- glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
- ImmOffset = ConstOffset->getZExtValue() + 1;
+ // If we have a constant offset, try to use the 0 in m0 as the base.
+ // TODO: Look into changing the default m0 initialization value. If the
+ // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
+ // the immediate offset.
+ glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
+ ImmOffset = ConstOffset->getZExtValue();
} else {
if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
ImmOffset = BaseOffset.getConstantOperandVal(1);
@@ -2182,22 +2302,7 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
glueCopyToM0(N, SDValue(M0Base, 0));
}
- SDValue V0;
SDValue Chain = N->getOperand(0);
- SDValue Glue;
- if (HasVSrc) {
- SDValue VSrc0 = N->getOperand(2);
-
- // The manual doesn't mention this, but it seems only v0 works.
- V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
-
- SDValue CopyToV0 = CurDAG->getCopyToReg(
- N->getOperand(0), SL, V0, VSrc0,
- N->getOperand(N->getNumOperands() - 1));
- Chain = CopyToV0;
- Glue = CopyToV0.getValue(1);
- }
-
SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
// TODO: Can this just be removed from the instruction?
@@ -2206,14 +2311,11 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
const unsigned Opc = gwsIntrinToOpcode(IntrID);
SmallVector<SDValue, 5> Ops;
if (HasVSrc)
- Ops.push_back(V0);
+ Ops.push_back(N->getOperand(2));
Ops.push_back(OffsetField);
Ops.push_back(GDS);
Ops.push_back(Chain);
- if (HasVSrc)
- Ops.push_back(Glue);
-
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
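For context on the m0 change above: per the comment in this function, the GWS resource index is (m0 base + instruction offset field) % 64. With the old default m0 of -1 the low 16 bits are all ones, so the offset field had to carry offset + 1; with m0 = 0 the offset can be encoded directly. A tiny self-check of that equivalence (the % 64 model is taken from the comment, not from hardware documentation):

    #include <cassert>

    // GWS resource index = (m0 base + instruction offset field) % 64.
    static unsigned gwsIndex(unsigned M0Base, unsigned OffsetField) {
      return (M0Base + OffsetField) % 64;
    }

    int main() {
      for (unsigned Offset = 0; Offset < 64; ++Offset) {
        assert(gwsIndex(0xFFFFu, Offset + 1) == Offset); // old: m0 = -1, low 16 bits
        assert(gwsIndex(0u, Offset) == Offset);          // new: m0 = 0
      }
    }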
@@ -2233,6 +2335,28 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
SelectCode(N);
}
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned Opcode;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_wqm:
+ Opcode = AMDGPU::WQM;
+ break;
+ case Intrinsic::amdgcn_softwqm:
+ Opcode = AMDGPU::SOFT_WQM;
+ break;
+ case Intrinsic::amdgcn_wwm:
+ Opcode = AMDGPU::WWM;
+ break;
+ default:
+ SelectCode(N);
+ return;
+ }
+
+ SDValue Src = N->getOperand(1);
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
+}
+
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntrID) {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 39016ed37193..1115d8c23620 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -12,10 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f
-#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f
-#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f
-
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
@@ -37,82 +33,9 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
-static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State,
- const TargetRegisterClass *RC,
- unsigned NumRegs) {
- ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
- unsigned RegResult = State.AllocateReg(RegList);
- if (RegResult == AMDGPU::NoRegister)
- return false;
-
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
- return true;
-}
-
-static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- switch (LocVT.SimpleTy) {
- case MVT::i64:
- case MVT::f64:
- case MVT::v2i32:
- case MVT::v2f32:
- case MVT::v4i16:
- case MVT::v4f16: {
- // Up to SGPR0-SGPR105
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::SGPR_64RegClass, 53);
- }
- default:
- return false;
- }
-}
-
-// Allocate up to VGPR31.
-//
-// TODO: Since there are no VGPR alignent requirements would it be better to
-// split into individual scalar registers?
-static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- switch (LocVT.SimpleTy) {
- case MVT::i64:
- case MVT::f64:
- case MVT::v2i32:
- case MVT::v2f32:
- case MVT::v4i16:
- case MVT::v4f16: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_64RegClass, 31);
- }
- case MVT::v4i32:
- case MVT::v4f32:
- case MVT::v2i64:
- case MVT::v2f64: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_128RegClass, 29);
- }
- case MVT::v8i32:
- case MVT::v8f32: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_256RegClass, 25);
-
- }
- case MVT::v16i32:
- case MVT::v16f32: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_512RegClass, 17);
-
- }
- default:
- return false;
- }
-}
-
#include "AMDGPUGenCallingConv.inc"
// Find a larger type to do a load / store of a vector with.
@@ -208,7 +131,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
}
- for (MVT VT : MVT::integer_vector_valuetypes()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
@@ -218,6 +141,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
@@ -225,8 +151,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
@@ -286,8 +215,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+ setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
+ setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -571,6 +503,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FABS);
setTargetDAGCombine(ISD::AssertZext);
setTargetDAGCombine(ISD::AssertSext);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
}
//===----------------------------------------------------------------------===//
@@ -630,15 +563,26 @@ static bool hasSourceMods(const SDNode *N) {
case ISD::FREM:
case ISD::INLINEASM:
case ISD::INLINEASM_BR:
- case AMDGPUISD::INTERP_P1:
- case AMDGPUISD::INTERP_P2:
case AMDGPUISD::DIV_SCALE:
+ case ISD::INTRINSIC_W_CHAIN:
// TODO: Should really be looking at the users of the bitcast. These are
// problematic because bitcasts are used to legalize all stores to integer
// types.
case ISD::BITCAST:
return false;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_interp_mov:
+ case Intrinsic::amdgcn_interp_p1_f16:
+ case Intrinsic::amdgcn_interp_p2_f16:
+ return false;
+ default:
+ return true;
+ }
+ }
default:
return true;
}
@@ -745,8 +689,9 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
return false;
bool Fast = false;
- return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
- MMO, &Fast) && Fast;
+ return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ CastTy, MMO, &Fast) &&
+ Fast;
}
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -782,9 +727,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
break;
case ISD::LOAD:
{
- const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
- if (L->getMemOperand()->getAddrSpace()
- == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
+ AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
return false;
}
@@ -1199,9 +1143,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::FLOG:
- return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
+ return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
case ISD::FLOG10:
- return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
+ return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
case ISD::FEXP:
return lowerFEXP(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
@@ -1236,7 +1180,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
}
}
-static bool hasDefinedInitializer(const GlobalValue *GV) {
+bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
if (!GVar || !GVar->hasInitializer())
return false;
@@ -2349,30 +2293,13 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
}
-// Return M_LOG2E of appropriate type
-static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) {
- switch (VT.getScalarType().getSimpleVT().SimpleTy) {
- case MVT::f32:
- return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT);
- case MVT::f16:
- return DAG.getConstantFP(
- APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"),
- SL, VT);
- case MVT::f64:
- return DAG.getConstantFP(
- APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT);
- default:
- llvm_unreachable("unsupported fp type");
- }
-}
-
// exp2(M_LOG2E_F * f);
SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
- const SDValue K = getLog2EVal(DAG, SL, VT);
+ const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
}
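The constants swapped in above come from the llvm::numbers namespace in llvm/Support/MathExtras.h (hence the new include earlier in this file). The lowering relies on the usual base-change identities: ln x = log2(x) / log2(e), log10 x = log2(x) * (ln 2 / ln 10), and e^x = exp2(x * log2(e)). A quick numeric sanity check of those identities, with the constant values written out:

    #include <cassert>
    #include <cmath>

    int main() {
      const double x = 3.7;
      const double log2e = 1.4426950408889634; // numbers::log2e
      const double ln2   = 0.6931471805599453; // numbers::ln2
      const double ln10  = 2.302585092994046;  // numbers::ln10
      assert(std::abs(std::log(x)   - std::log2(x) * (1.0 / log2e)) < 1e-12);
      assert(std::abs(std::log10(x) - std::log2(x) * (ln2 / ln10))  < 1e-12);
      assert(std::abs(std::exp(x)   - std::exp2(x * log2e))         < 1e-12);
    }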
@@ -2836,8 +2763,16 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
static SDValue simplifyI24(SDNode *Node24,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- SDValue LHS = Node24->getOperand(0);
- SDValue RHS = Node24->getOperand(1);
+ bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
+
+ SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
+ SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
+ unsigned NewOpcode = Node24->getOpcode();
+ if (IsIntrin) {
+ unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
+ NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
+ AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ }
APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
@@ -2847,7 +2782,7 @@ static SDValue simplifyI24(SDNode *Node24,
SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
if (DemandedLHS || DemandedRHS)
- return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(),
+ return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
DemandedLHS ? DemandedLHS : LHS,
DemandedRHS ? DemandedRHS : RHS);
@@ -2904,54 +2839,6 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
return true;
}
-// Find a load or store from corresponding pattern root.
-// Roots may be build_vector, bitconvert or their combinations.
-static MemSDNode* findMemSDNode(SDNode *N) {
- N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
- if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
- return MN;
- assert(isa<BuildVectorSDNode>(N));
- for (SDValue V : N->op_values())
- if (MemSDNode *MN =
- dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
- return MN;
- llvm_unreachable("cannot find MemSDNode in the pattern!");
-}
-
-bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned,
- SelectionDAG &DAG,
- SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset,
- SDValue &SLC) const {
- const GCNSubtarget &ST =
- DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
- int64_t OffsetVal = 0;
-
- if (ST.hasFlatInstOffsets() &&
- (!ST.hasFlatSegmentOffsetBug() ||
- findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
- DAG.isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
- const SIInstrInfo *TII = ST.getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(),
- IsSigned)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- }
- }
-
- VAddr = Addr;
- Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
- SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1);
-
- return true;
-}
-
// Replace load of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
@@ -3085,6 +2972,19 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
return SDValue();
}
+
+SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IID) {
+ case Intrinsic::amdgcn_mul_i24:
+ case Intrinsic::amdgcn_mul_u24:
+ return simplifyI24(N, DCI);
+ default:
+ return SDValue();
+ }
+}
+
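Two things changed around simplifyI24 above: it is now also reached from the raw ISD::INTRINSIC_WO_CHAIN form, where operand 0 holds the intrinsic ID and the multiplicands sit at operands 1 and 2, and it rebuilds the node with the resolved MUL_I24/MUL_U24 opcode. The demanded-bits query itself is unchanged; as a reminder of what it asks for (sketch only, the helper name is illustrative):

    #include "llvm/ADT/APInt.h"

    // mul_i24 / mul_u24 only read the low 24 bits of each multiplicand, so the
    // combine asks GetDemandedBits for exactly this mask.
    static llvm::APInt mul24DemandedBits(unsigned OperandBits) {
      return llvm::APInt::getLowBitsSet(OperandBits, 24);
    }
    // mul24DemandedBits(32) == APInt(32, 0x00FFFFFF)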
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -4173,6 +4073,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AssertZext:
case ISD::AssertSext:
return performAssertSZExtCombine(N, DCI);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return performIntrinsicWOChainCombine(N, DCI);
}
return SDValue();
}
@@ -4203,14 +4105,28 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
+// This may be called multiple times, and nothing prevents creating multiple
+// objects at the same offset. See if we already defined this object.
+static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
+ int64_t Offset) {
+ for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
+ if (MFI.getObjectOffset(I) == Offset) {
+ assert(MFI.getObjectSize(I) == Size);
+ return I;
+ }
+ }
+
+ return MFI.CreateFixedObject(Size, Offset, true);
+}
+
SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
EVT VT,
const SDLoc &SL,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
- int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
@@ -4260,7 +4176,7 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
const AMDGPUSubtarget &ST =
AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
- unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+ const Align Alignment = ST.getAlignmentForImplicitArgPtr();
uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
ExplicitArgOffset;
switch (Param) {
@@ -4295,6 +4211,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
NODE_NAME_CASE(SETREG)
+ NODE_NAME_CASE(DENORM_MODE)
NODE_NAME_CASE(FMA_W_CHAIN)
NODE_NAME_CASE(FMUL_W_CHAIN)
NODE_NAME_CASE(CLAMP)
@@ -4377,13 +4294,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
- NODE_NAME_CASE(INIT_EXEC)
- NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
- NODE_NAME_CASE(SENDMSG)
- NODE_NAME_CASE(SENDMSGHALT)
- NODE_NAME_CASE(INTERP_MOV)
- NODE_NAME_CASE(INTERP_P1)
- NODE_NAME_CASE(INTERP_P2)
NODE_NAME_CASE(INTERP_P1LL_F16)
NODE_NAME_CASE(INTERP_P1LV_F16)
NODE_NAME_CASE(INTERP_P2_F16)
@@ -4428,6 +4338,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_AND)
NODE_NAME_CASE(BUFFER_ATOMIC_OR)
NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
+ NODE_NAME_CASE(BUFFER_ATOMIC_INC)
+ NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
@@ -4576,9 +4488,9 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
} else if (SelBits == 0x0c) {
- Known.Zero |= 0xff << I;
+ Known.Zero |= 0xFFull << I;
} else if (SelBits > 0x0c) {
- Known.One |= 0xff << I;
+ Known.One |= 0xFFull << I;
}
Sel >>= 8;
}
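One note on the 0xFFull change just above: I steps through 0, 8, 16 and 24 (one byte of the perm selector at a time), and 0xff << 24 is evaluated in plain int, which no longer fits a signed 32-bit value (undefined behaviour before C++20) and would sign-extend into the upper bits once widened. The ull suffix keeps the shift in uint64_t throughout. A minimal sketch of the difference:

    #include <cstdint>

    // Well-defined for every byte position used here (I in {0, 8, 16, 24}).
    static uint64_t byteMaskAt(unsigned I) {
      return 0xFFull << I;
    }
    // By contrast, (0xff << 24) is a signed-int shift whose result overflows
    // int; widening the resulting negative value to 64 bits would give
    // 0xFFFFFFFFFF000000 rather than 0x00000000FF000000.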
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fe7ad694943d..dea0d1d4343a 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -38,6 +38,7 @@ private:
public:
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
+ static bool hasDefinedInitializer(const GlobalValue *GV);
protected:
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -78,6 +79,7 @@ protected:
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
unsigned Opc, SDValue LHS,
@@ -324,10 +326,6 @@ public:
}
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
-
- bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N,
- SDValue Addr, SDValue &VAddr, SDValue &Offset,
- SDValue &SLC) const;
};
namespace AMDGPUISD {
@@ -369,6 +367,9 @@ enum NodeType : unsigned {
// result bit per item in the wavefront.
SETCC,
SETREG,
+
+ DENORM_MODE,
+
// FP ops with input and output chain.
FMA_W_CHAIN,
FMUL_W_CHAIN,
@@ -475,13 +476,6 @@ enum NodeType : unsigned {
BUILD_VERTICAL_VECTOR,
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
- INIT_EXEC,
- INIT_EXEC_FROM_INPUT,
- SENDMSG,
- SENDMSGHALT,
- INTERP_MOV,
- INTERP_P1,
- INTERP_P2,
INTERP_P1LL_F16,
INTERP_P1LV_F16,
INTERP_P2_F16,
@@ -532,6 +526,8 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_AND,
BUFFER_ATOMIC_OR,
BUFFER_ATOMIC_XOR,
+ BUFFER_ATOMIC_INC,
+ BUFFER_ATOMIC_DEC,
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_FADD,
BUFFER_ATOMIC_PK_FADD,
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index f4df20b8f03e..a83ec23ec054 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -51,7 +51,7 @@ ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
// Inliner constraint to achieve reasonable compilation time
static cl::opt<size_t>
-MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300),
+MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
cl::desc("Maximum BB number allowed in a function after inlining"
" (compile time constraint)"));
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 4a8446955496..cf0ce5659951 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -110,39 +110,38 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
// Force dependencies for vector trunc stores
def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>;
-def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
-def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
-
+def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
+def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
// out = a - floor(a)
-def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
// out = 1.0 / a
-def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
-def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
-def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
-def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+def AMDGPUrsq_legacy_impl : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+def AMDGPUrcp_legacy_impl : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
-def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
+def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
-def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+def AMDGPUldexp_impl : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
-def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
-def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
-def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
-def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
-def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
+def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
+def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
+def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
-def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
+def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
// out = max(a, b) a and b are floats, where a nan comparison fails.
// This is not commutative because this gives the second operand:
@@ -285,7 +284,7 @@ def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
-def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
+def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>;
@@ -320,7 +319,7 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
[]
>;
-def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
@@ -330,35 +329,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
-def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
- SDTypeProfile<0, 1, [SDTCisInt<0>]>,
- [SDNPHasChain, SDNPInGlue]>;
-
-def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",
- SDTypeProfile<0, 2,
- [SDTCisInt<0>, SDTCisInt<1>]>,
- [SDNPHasChain, SDNPInGlue]>;
-
-def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
- SDTypeProfile<0, 1, [SDTCisInt<0>]>,
- [SDNPHasChain, SDNPInGlue]>;
-
-def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT",
- SDTypeProfile<0, 1, [SDTCisInt<0>]>,
- [SDNPHasChain, SDNPInGlue]>;
-
-def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV",
- SDTypeProfile<1, 3, [SDTCisFP<0>]>,
- [SDNPInGlue]>;
-
-def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1",
- SDTypeProfile<1, 3, [SDTCisFP<0>]>,
- [SDNPInGlue, SDNPOutGlue]>;
-
-def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2",
- SDTypeProfile<1, 4, [SDTCisFP<0>]>,
- [SDNPInGlue]>;
-
def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
SDTypeProfile<1, 7, [SDTCisFP<0>]>,
[SDNPInGlue, SDNPOutGlue]>;
@@ -425,3 +395,65 @@ def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
+
+
+//===----------------------------------------------------------------------===//
+// Intrinsic/Custom node compatability PatFrags
+//===----------------------------------------------------------------------===//
+
+def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src),
+ (AMDGPUrcp_impl node:$src)]>;
+def AMDGPUrcp_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rcp_legacy node:$src),
+ (AMDGPUrcp_legacy_impl node:$src)]>;
+
+def AMDGPUrsq_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rsq_legacy node:$src),
+ (AMDGPUrsq_legacy_impl node:$src)]>;
+
+def AMDGPUrsq : PatFrags<(ops node:$src), [(int_amdgcn_rsq node:$src),
+ (AMDGPUrsq_impl node:$src)]>;
+
+def AMDGPUrsq_clamp : PatFrags<(ops node:$src), [(int_amdgcn_rsq_clamp node:$src),
+ (AMDGPUrsq_clamp_impl node:$src)]>;
+
+def AMDGPUsin : PatFrags<(ops node:$src), [(int_amdgcn_sin node:$src),
+ (AMDGPUsin_impl node:$src)]>;
+def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src),
+ (AMDGPUcos_impl node:$src)]>;
+def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src),
+ (AMDGPUfract_impl node:$src)]>;
+
+def AMDGPUldexp : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_ldexp node:$src0, node:$src1),
+ (AMDGPUldexp_impl node:$src0, node:$src1)]>;
+
+def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_class node:$src0, node:$src1),
+ (AMDGPUfp_class_impl node:$src0, node:$src1)]>;
+
+def AMDGPUfmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(int_amdgcn_fmed3 node:$src0, node:$src1, node:$src2),
+ (AMDGPUfmed3_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUffbh_i32 : PatFrags<(ops node:$src),
+ [(int_amdgcn_sffbh node:$src),
+ (AMDGPUffbh_i32_impl node:$src)]>;
+
+def AMDGPUpkrtz_f16_f32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pkrtz node:$src0, node:$src1),
+ (AMDGPUpkrtz_f16_f32_impl node:$src0, node:$src1)]>;
+
+def AMDGPUpknorm_i16_f32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pknorm_i16 node:$src0, node:$src1),
+ (AMDGPUpknorm_i16_f32_impl node:$src0, node:$src1)]>;
+
+def AMDGPUpknorm_u16_f32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pknorm_u16 node:$src0, node:$src1),
+ (AMDGPUpknorm_u16_f32_impl node:$src0, node:$src1)]>;
+
+def AMDGPUpk_i16_i32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pk_i16 node:$src0, node:$src1),
+ (AMDGPUpk_i16_i32_impl node:$src0, node:$src1)]>;
+
+def AMDGPUpk_u16_u32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pk_u16 node:$src0, node:$src1),
+ (AMDGPUpk_u16_u32_impl node:$src0, node:$src1)]>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 901a2eaa8829..3cfa9d57ec46 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -19,8 +19,10 @@
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -61,8 +63,14 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
+void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
+ CodeGenCoverage &CoverageInfo) {
+ MRI = &MF.getRegInfo();
+ InstructionSelector::setupMF(MF, KB, CoverageInfo);
+}
+
static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return Reg == AMDGPU::SCC;
auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
@@ -71,7 +79,9 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
if (RC) {
// FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the
// context of the register bank has been lost.
- if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID)
+ // getRegClassForSizeOnBank has a hack where it uses exactly SGPR_32RegClass,
+ // which won't ever be constrained any further.
+ if (RC != &AMDGPU::SGPR_32RegClass)
return false;
const LLT Ty = MRI.getType(Reg);
return Ty.isValid() && Ty.getSizeInBits() == 1;
@@ -83,7 +93,7 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
bool AMDGPUInstructionSelector::isVCC(Register Reg,
const MachineRegisterInfo &MRI) const {
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return Reg == TRI.getVCC();
auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
@@ -102,8 +112,6 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
I.setDesc(TII.get(TargetOpcode::COPY));
const MachineOperand &Src = I.getOperand(1);
@@ -111,33 +119,33 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
Register DstReg = Dst.getReg();
Register SrcReg = Src.getReg();
- if (isVCC(DstReg, MRI)) {
+ if (isVCC(DstReg, *MRI)) {
if (SrcReg == AMDGPU::SCC) {
const TargetRegisterClass *RC
- = TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
if (!RC)
return true;
- return RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
}
- if (!isVCC(SrcReg, MRI)) {
+ if (!isVCC(SrcReg, *MRI)) {
// TODO: Should probably leave the copy and let copyPhysReg expand it.
- if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI))
+ if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
return false;
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
.addImm(0)
.addReg(SrcReg);
- if (!MRI.getRegClassOrNull(SrcReg))
- MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI));
+ if (!MRI->getRegClassOrNull(SrcReg))
+ MRI->setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, *MRI));
I.eraseFromParent();
return true;
}
const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(Dst, MRI);
- if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI))
+ TRI.getConstrainedRegClassForOperand(Dst, *MRI);
+ if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
return false;
// Don't constrain the source register to a class so the def instruction
@@ -148,8 +156,8 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
// with size 1. An SReg_32 with size 1 is ambiguous with wave32.
if (Src.isUndef()) {
const TargetRegisterClass *SrcRC =
- TRI.getConstrainedRegClassForOperand(Src, MRI);
- if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
+ TRI.getConstrainedRegClassForOperand(Src, *MRI);
+ if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
return false;
}
@@ -157,30 +165,26 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
}
for (const MachineOperand &MO : I.operands()) {
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ if (Register::isPhysicalRegister(MO.getReg()))
continue;
const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, MRI);
+ TRI.getConstrainedRegClassForOperand(MO, *MRI);
if (!RC)
continue;
- RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
}
return true;
}
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
const Register DefReg = I.getOperand(0).getReg();
- const LLT DefTy = MRI.getType(DefReg);
+ const LLT DefTy = MRI->getType(DefReg);
// TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
const RegClassOrRegBank &RegClassOrBank =
- MRI.getRegClassOrRegBank(DefReg);
+ MRI->getRegClassOrRegBank(DefReg);
const TargetRegisterClass *DefRC
= RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
@@ -196,7 +200,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
return false;
}
- DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI);
+ DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
if (!DefRC) {
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
return false;
@@ -204,7 +208,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
I.setDesc(TII.get(TargetOpcode::PHI));
- return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
+ return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}
MachineOperand
@@ -214,13 +218,11 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
MachineInstr *MI = MO.getParent();
MachineBasicBlock *BB = MO.getParent()->getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- Register DstReg = MRI.createVirtualRegister(&SubRC);
+ Register DstReg = MRI->createVirtualRegister(&SubRC);
if (MO.isReg()) {
unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
.addReg(Reg, 0, ComposedSubIdx);
@@ -244,10 +246,6 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
}
}
-static int64_t getConstant(const MachineInstr *MI) {
- return MI->getOperand(1).getCImm()->getSExtValue();
-}
-
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
switch (Opc) {
case AMDGPU::G_AND:
@@ -262,16 +260,13 @@ static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
}
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineOperand &Dst = I.getOperand(0);
MachineOperand &Src0 = I.getOperand(1);
MachineOperand &Src1 = I.getOperand(2);
Register DstReg = Dst.getReg();
- unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
if (DstRB->getID() == AMDGPU::VCCRegBankID) {
const TargetRegisterClass *RC = TRI.getBoolRC();
unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
@@ -282,12 +277,12 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
// The selector for G_ICMP relies on seeing the register bank for the result
// is VCC. In wave32 if we constrain the registers to SReg_32 here, it will
// be ambiguous whether it's a scalar or vector bool.
- if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg()))
- MRI.setRegClass(Src0.getReg(), RC);
- if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg()))
- MRI.setRegClass(Src1.getReg(), RC);
+ if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg()))
+ MRI->setRegClass(Src0.getReg(), RC);
+ if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg()))
+ MRI->setRegClass(Src1.getReg(), RC);
- return RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
}
// TODO: Should this allow an SCC bank result, and produce a copy from SCC for
@@ -295,14 +290,7 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
I.setDesc(TII.get(InstOpc));
-
- const TargetRegisterClass *RC
- = TRI.getConstrainedRegClassForOperand(Dst, MRI);
- if (!RC)
- return false;
- return RBI.constrainGenericRegister(DstReg, *RC, MRI) &&
- RBI.constrainGenericRegister(Src0.getReg(), *RC, MRI) &&
- RBI.constrainGenericRegister(Src1.getReg(), *RC, MRI);
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
return false;
@@ -311,11 +299,10 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
Register DstReg = I.getOperand(0).getReg();
const DebugLoc &DL = I.getDebugLoc();
- unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
@@ -340,7 +327,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;
- Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass());
+ Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
MachineInstr *Add
= BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
.addDef(UnusedCarry, RegState::Dead)
@@ -363,8 +350,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
- Register DstLo = MRI.createVirtualRegister(&HalfRC);
- Register DstHi = MRI.createVirtualRegister(&HalfRC);
+ Register DstLo = MRI->createVirtualRegister(&HalfRC);
+ Register DstHi = MRI->createVirtualRegister(&HalfRC);
if (IsSALU) {
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
@@ -375,14 +362,14 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
.add(Hi2);
} else {
const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
- Register CarryReg = MRI.createVirtualRegister(CarryRC);
+ Register CarryReg = MRI->createVirtualRegister(CarryRC);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
.addDef(CarryReg)
.add(Lo1)
.add(Lo2)
.addImm(0);
MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
- .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead)
+ .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
.add(Hi1)
.add(Hi2)
.addReg(CarryReg, RegState::Kill)
@@ -399,19 +386,61 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
.addImm(AMDGPU::sub1);
- if (!RBI.constrainGenericRegister(DstReg, RC, MRI))
+ if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
return false;
I.eraseFromParent();
return true;
}
-bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- assert(I.getOperand(2).getImm() % 32 == 0);
- unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32);
+ const DebugLoc &DL = I.getDebugLoc();
+ Register Dst0Reg = I.getOperand(0).getReg();
+ Register Dst1Reg = I.getOperand(1).getReg();
+ const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO;
+
+ if (!isSCC(Dst1Reg, MRI)) {
+ // The names of these opcodes are misleading: v_add_i32/v_sub_i32 have an
+ // unsigned carry-out despite the _i32 name. They were renamed to _U32 in VI.
+ // FIXME: We should probably rename the opcodes here.
+ unsigned NewOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ I.setDesc(TII.get(NewOpc));
+ I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ I.addOperand(*MF, MachineOperand::CreateImm(0));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
+ Register Src0Reg = I.getOperand(2).getReg();
+ Register Src1Reg = I.getOperand(3).getReg();
+ unsigned NewOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), Dst0Reg)
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
+ .addReg(AMDGPU::SCC);
+
+ if (!MRI.getRegClassOrNull(Dst1Reg))
+ MRI.setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
+
+ if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, MRI) ||
+ !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, MRI) ||
+ !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, MRI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ unsigned Offset = I.getOperand(2).getImm();
+ if (Offset % 32 != 0)
+ return false;
+
+ unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32);
const DebugLoc &DL = I.getDebugLoc();
MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY),
I.getOperand(0).getReg())
@@ -419,10 +448,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
for (const MachineOperand &MO : Copy->operands()) {
const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, MRI);
+ TRI.getConstrainedRegClassForOperand(MO, *MRI);
if (!RC)
continue;
- RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
}
I.eraseFromParent();
return true;
@@ -430,21 +459,19 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
MachineBasicBlock *BB = MI.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
Register DstReg = MI.getOperand(0).getReg();
- LLT DstTy = MRI.getType(DstReg);
- LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ LLT DstTy = MRI->getType(DstReg);
+ LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
const unsigned SrcSize = SrcTy.getSizeInBits();
if (SrcSize < 32)
return false;
const DebugLoc &DL = MI.getDebugLoc();
- const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
const unsigned DstSize = DstTy.getSizeInBits();
const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
if (!DstRC)
return false;
@@ -457,12 +484,12 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
MIB.addImm(SubRegs[I]);
const TargetRegisterClass *SrcRC
- = TRI.getConstrainedRegClassForOperand(Src, MRI);
- if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI))
+ = TRI.getConstrainedRegClassForOperand(Src, *MRI);
+ if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
return false;
}
- if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI))
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
return false;
MI.eraseFromParent();
@@ -471,25 +498,23 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
MachineBasicBlock *BB = MI.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const int NumDst = MI.getNumOperands() - 1;
MachineOperand &Src = MI.getOperand(NumDst);
Register SrcReg = Src.getReg();
Register DstReg0 = MI.getOperand(0).getReg();
- LLT DstTy = MRI.getType(DstReg0);
- LLT SrcTy = MRI.getType(SrcReg);
+ LLT DstTy = MRI->getType(DstReg0);
+ LLT SrcTy = MRI->getType(SrcReg);
const unsigned DstSize = DstTy.getSizeInBits();
const unsigned SrcSize = SrcTy.getSizeInBits();
const DebugLoc &DL = MI.getDebugLoc();
- const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);
+ const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
const TargetRegisterClass *SrcRC =
- TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI);
- if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
+ if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
return false;
const unsigned SrcFlags = getUndefRegState(Src.isUndef());
@@ -504,8 +529,8 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
.addReg(SrcReg, SrcFlags, SubRegs[I]);
const TargetRegisterClass *DstRC =
- TRI.getConstrainedRegClassForOperand(Dst, MRI);
- if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI))
+ TRI.getConstrainedRegClassForOperand(Dst, *MRI);
+ if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
return false;
}
@@ -518,16 +543,13 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
}
bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const MachineOperand &MO = I.getOperand(0);
// FIXME: Interface for getConstrainedRegClassForOperand needs work. The
// regbank check here is to know why getConstrainedRegClassForOperand failed.
- const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI);
- if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) ||
- (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) {
+ const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
+ if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
+ (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
return true;
}
@@ -537,44 +559,62 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32);
- DebugLoc DL = I.getDebugLoc();
- MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
- .addDef(I.getOperand(0).getReg())
- .addReg(I.getOperand(1).getReg())
- .addReg(I.getOperand(2).getReg())
- .addImm(SubReg);
-
- for (const MachineOperand &MO : Ins->operands()) {
- if (!MO.isReg())
- continue;
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
- continue;
- const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, MRI);
- if (!RC)
- continue;
- RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
- }
+ Register DstReg = I.getOperand(0).getReg();
+ Register Src0Reg = I.getOperand(1).getReg();
+ Register Src1Reg = I.getOperand(2).getReg();
+ LLT Src1Ty = MRI->getType(Src1Reg);
+
+ unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
+ unsigned InsSize = Src1Ty.getSizeInBits();
+
+ int64_t Offset = I.getOperand(3).getImm();
+ if (Offset % 32 != 0)
+ return false;
+
+ unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
+ if (SubReg == AMDGPU::NoSubRegister)
+ return false;
+
+ const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+ if (!DstRC)
+ return false;
+
+ const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
+ const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
+ const TargetRegisterClass *Src0RC =
+ TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
+ const TargetRegisterClass *Src1RC =
+ TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
+
+ // Deal with weird cases where the class only partially supports the subreg
+ // index.
+ Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
+ if (!Src0RC)
+ return false;
+
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
+ !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
+ !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
+ return false;
+
+ const DebugLoc &DL = I.getDebugLoc();
+ BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
+ .addReg(Src0Reg)
+ .addReg(Src1Reg)
+ .addImm(SubReg);
+
I.eraseFromParent();
return true;
}
-bool AMDGPUInstructionSelector::selectG_INTRINSIC(
- MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
- unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
+bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
+ unsigned IntrinsicID = I.getIntrinsicID();
switch (IntrinsicID) {
- case Intrinsic::maxnum:
- case Intrinsic::minnum:
- case Intrinsic::amdgcn_cvt_pkrtz:
- return selectImpl(I, CoverageInfo);
case Intrinsic::amdgcn_if_break: {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
@@ -589,15 +629,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(
I.eraseFromParent();
- for (Register Reg : { DstReg, Src0Reg, Src1Reg }) {
- if (!MRI.getRegClassOrNull(Reg))
- MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
- }
+ for (Register Reg : { DstReg, Src0Reg, Src1Reg })
+ MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
return true;
}
default:
- return selectImpl(I, CoverageInfo);
+ return selectImpl(I, *CoverageInfo);
}
}
@@ -677,17 +715,15 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = I.getDebugLoc();
- unsigned SrcReg = I.getOperand(2).getReg();
- unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI);
+ Register SrcReg = I.getOperand(2).getReg();
+ unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
- unsigned CCReg = I.getOperand(0).getReg();
- if (isSCC(CCReg, MRI)) {
+ Register CCReg = I.getOperand(0).getReg();
+ if (isSCC(CCReg, *MRI)) {
int Opcode = getS_CMPOpcode(Pred, Size);
if (Opcode == -1)
return false;
@@ -698,7 +734,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
.addReg(AMDGPU::SCC);
bool Ret =
constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
- RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI);
+ RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
I.eraseFromParent();
return Ret;
}
@@ -712,7 +748,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
.add(I.getOperand(2))
.add(I.getOperand(3));
RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
- *TRI.getBoolRC(), MRI);
+ *TRI.getBoolRC(), *MRI);
bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
I.eraseFromParent();
return Ret;
@@ -736,19 +772,273 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
.addImm(Enabled);
}
+static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
+ int64_t C;
+ if (mi_match(Reg, MRI, m_ICst(C)) && C == 0)
+ return true;
+
+ // FIXME: matcher should ignore copies
+ return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0;
+}
+
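+// The auxiliary data immediate packs the cache policy bits: glc in bit 0,
+// slc in bit 1, dlc in bit 2 and swz in bit 3.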
+static unsigned extractGLC(unsigned AuxiliaryData) {
+ return AuxiliaryData & 1;
+}
+
+static unsigned extractSLC(unsigned AuxiliaryData) {
+ return (AuxiliaryData >> 1) & 1;
+}
+
+static unsigned extractDLC(unsigned AuxiliaryData) {
+ return (AuxiliaryData >> 2) & 1;
+}
+
+static unsigned extractSWZ(unsigned AuxiliaryData) {
+ return (AuxiliaryData >> 3) & 1;
+}
+
+// Returns Base register, constant offset, and offset def point.
+static std::tuple<Register, unsigned, MachineInstr *>
+getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
+ MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ if (!Def)
+ return std::make_tuple(Reg, 0, nullptr);
+
+ if (Def->getOpcode() == AMDGPU::G_CONSTANT) {
+ unsigned Offset;
+ const MachineOperand &Op = Def->getOperand(1);
+ if (Op.isImm())
+ Offset = Op.getImm();
+ else
+ Offset = Op.getCImm()->getZExtValue();
+
+ return std::make_tuple(Register(), Offset, Def);
+ }
+
+ int64_t Offset;
+ if (Def->getOpcode() == AMDGPU::G_ADD) {
+ // TODO: Handle G_OR used for add case
+ if (mi_match(Def->getOperand(1).getReg(), MRI, m_ICst(Offset)))
+ return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def);
+
+ // FIXME: matcher should ignore copies
+ if (mi_match(Def->getOperand(1).getReg(), MRI, m_Copy(m_ICst(Offset))))
+ return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def);
+ }
+
+ return std::make_tuple(Reg, 0, Def);
+}
+
+static unsigned getBufferStoreOpcode(LLT Ty,
+ const unsigned MemSize,
+ const bool Offen) {
+ const int Size = Ty.getSizeInBits();
+ switch (8 * MemSize) {
+ case 8:
+ return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
+ case 16:
+ return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
+ default:
+ unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
+ if (Size > 32)
+ Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
+ return Opc;
+ }
+}
+
+static unsigned getBufferStoreFormatOpcode(LLT Ty,
+ const unsigned MemSize,
+ const bool Offen) {
+ bool IsD16Packed = Ty.getScalarSizeInBits() == 16;
+ bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits();
+ int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
+
+ if (IsD16Packed) {
+ switch (NumElts) {
+ case 1:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
+ case 2:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact;
+ case 3:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact;
+ case 4:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
+ default:
+ return -1;
+ }
+ }
+
+ if (IsD16Unpacked) {
+ switch (NumElts) {
+ case 1:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
+ case 2:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
+ case 3:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
+ case 4:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
+ default:
+ return -1;
+ }
+ }
+
+ switch (NumElts) {
+ case 1:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
+ case 2:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
+ case 3:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
+ case 4:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
+ default:
+ return -1;
+ }
+
+ llvm_unreachable("unhandled buffer store");
+}
+
+// TODO: Move this to combiner
+// Returns base register, imm offset, total constant offset.
+std::tuple<Register, unsigned, unsigned>
+AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
+ Register OrigOffset) const {
+ const unsigned MaxImm = 4095;
+ Register BaseReg;
+ unsigned TotalConstOffset;
+ MachineInstr *OffsetDef;
+
+ std::tie(BaseReg, TotalConstOffset, OffsetDef)
+ = getBaseWithConstantOffset(*MRI, OrigOffset);
+
+ unsigned ImmOffset = TotalConstOffset;
+
+ // If the immediate value is too big for the immoffset field, put the value
+ // and -4096 into the immoffset field so that the value that is copied/added
+ // for the voffset field is a multiple of 4096, and it stands more chance
+  // of being CSEd with the copy/add for another similar load/store.
+ // However, do not do that rounding down to a multiple of 4096 if that is a
+ // negative number, as it appears to be illegal to have a negative offset
+ // in the vgpr, even if adding the immediate offset makes it positive.
+ unsigned Overflow = ImmOffset & ~MaxImm;
+ ImmOffset -= Overflow;
+ if ((int32_t)Overflow < 0) {
+ Overflow += ImmOffset;
+ ImmOffset = 0;
+ }
+
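+  // For example, a total constant offset of 5000 splits into ImmOffset = 904
+  // with Overflow = 4096 materialized into the voffset below, giving it a
+  // chance to be CSEd with neighbouring accesses.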
+ if (Overflow != 0) {
+ // In case this is in a waterfall loop, insert offset code at the def point
+ // of the offset, not inside the loop.
+ MachineBasicBlock::iterator OldInsPt = B.getInsertPt();
+ MachineBasicBlock &OldMBB = B.getMBB();
+ B.setInstr(*OffsetDef);
+
+ if (!BaseReg) {
+ BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ B.buildInstr(AMDGPU::V_MOV_B32_e32)
+ .addDef(BaseReg)
+ .addImm(Overflow);
+ } else {
+ Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ B.buildInstr(AMDGPU::V_MOV_B32_e32)
+ .addDef(OverflowVal)
+ .addImm(Overflow);
+
+ Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg)
+ .addReg(BaseReg)
+ .addReg(OverflowVal, RegState::Kill)
+ .addImm(0);
+ BaseReg = NewBaseReg;
+ }
+
+ B.setInsertPt(OldMBB, OldInsPt);
+ }
+
+ return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
+}
+
+bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI,
+ bool IsFormat) const {
+ MachineIRBuilder B(MI);
+ MachineFunction &MF = B.getMF();
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI->getType(VData);
+
+ int Size = Ty.getSizeInBits();
+ if (Size % 32 != 0)
+ return false;
+
+ // FIXME: Verifier should enforce 1 MMO for these intrinsics.
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ const int MemSize = MMO->getSize();
+
+ Register RSrc = MI.getOperand(2).getReg();
+ Register VOffset = MI.getOperand(3).getReg();
+ Register SOffset = MI.getOperand(4).getReg();
+ unsigned AuxiliaryData = MI.getOperand(5).getImm();
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize);
+
+ const bool Offen = !isZero(VOffset, *MRI);
+
+ int Opc = IsFormat ? getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
+ getBufferStoreOpcode(Ty, MemSize, Offen);
+ if (Opc == -1)
+ return false;
+
+ MachineInstrBuilder MIB = B.buildInstr(Opc)
+ .addUse(VData);
+
+ if (Offen)
+ MIB.addUse(VOffset);
+
+ MIB.addUse(RSrc)
+ .addUse(SOffset)
+ .addImm(ImmOffset)
+ .addImm(extractGLC(AuxiliaryData))
+ .addImm(extractSLC(AuxiliaryData))
+ .addImm(0) // tfe: FIXME: Remove from inst
+ .addImm(extractDLC(AuxiliaryData))
+ .addImm(extractSWZ(AuxiliaryData))
+ .addMemOperand(MMO);
+
+ MI.eraseFromParent();
+
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
- MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
+ MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- unsigned IntrinsicID = I.getOperand(0).getIntrinsicID();
+ unsigned IntrinsicID = I.getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp: {
- int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
- int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
- int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
- int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));
+ int64_t Tgt = I.getOperand(1).getImm();
+ int64_t Enabled = I.getOperand(2).getImm();
+ int64_t Done = I.getOperand(7).getImm();
+ int64_t VM = I.getOperand(8).getImm();
MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
I.getOperand(4).getReg(),
@@ -761,13 +1051,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
}
case Intrinsic::amdgcn_exp_compr: {
const DebugLoc &DL = I.getDebugLoc();
- int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
- int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
- unsigned Reg0 = I.getOperand(3).getReg();
- unsigned Reg1 = I.getOperand(4).getReg();
- unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
- int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));
+ int64_t Tgt = I.getOperand(1).getImm();
+ int64_t Enabled = I.getOperand(2).getImm();
+ Register Reg0 = I.getOperand(3).getReg();
+ Register Reg1 = I.getOperand(4).getReg();
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ int64_t Done = I.getOperand(5).getImm();
+ int64_t VM = I.getOperand(6).getImm();
BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
@@ -786,27 +1076,29 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
Register Reg = I.getOperand(1).getReg();
I.eraseFromParent();
- if (!MRI.getRegClassOrNull(Reg))
- MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
+ if (!MRI->getRegClassOrNull(Reg))
+ MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
return true;
}
+ case Intrinsic::amdgcn_raw_buffer_store:
+ return selectStoreIntrinsic(I, false);
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ return selectStoreIntrinsic(I, true);
default:
- return selectImpl(I, CoverageInfo);
+ return selectImpl(I, *CoverageInfo);
}
}
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = I.getDebugLoc();
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ Register DstReg = I.getOperand(0).getReg();
+ unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
assert(Size <= 32 || Size == 64);
const MachineOperand &CCOp = I.getOperand(1);
- unsigned CCReg = CCOp.getReg();
- if (isSCC(CCReg, MRI)) {
+ Register CCReg = CCOp.getReg();
+ if (isSCC(CCReg, *MRI)) {
unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
AMDGPU::S_CSELECT_B32;
MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
@@ -815,8 +1107,8 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
// The generic constrainSelectedInstRegOperands doesn't work for the scc register
// bank, because it does not cover the register class that we used to represent
// for it. So we need to manually set the register class here.
- if (!MRI.getRegClassOrNull(CCReg))
- MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI));
+ if (!MRI->getRegClassOrNull(CCReg))
+ MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
.add(I.getOperand(2))
.add(I.getOperand(3));
@@ -845,52 +1137,8 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
}
bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- DebugLoc DL = I.getDebugLoc();
- unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI);
- if (PtrSize != 64) {
- LLVM_DEBUG(dbgs() << "Unhandled address space\n");
- return false;
- }
-
- unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
- unsigned Opcode;
-
- // FIXME: Remove this when integers > s32 naturally selected.
- switch (StoreSize) {
- default:
- return false;
- case 32:
- Opcode = AMDGPU::FLAT_STORE_DWORD;
- break;
- case 64:
- Opcode = AMDGPU::FLAT_STORE_DWORDX2;
- break;
- case 96:
- Opcode = AMDGPU::FLAT_STORE_DWORDX3;
- break;
- case 128:
- Opcode = AMDGPU::FLAT_STORE_DWORDX4;
- break;
- }
-
- MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
- .add(I.getOperand(1))
- .add(I.getOperand(0))
- .addImm(0) // offset
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0); // dlc
-
-
- // Now that we selected an opcode, we need to constrain the register
- // operands to use appropriate classes.
- bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
-
- I.eraseFromParent();
- return Ret;
+ initM0(I);
+ return selectImpl(I, *CoverageInfo);
}
static int sizeToSubRegIndex(unsigned Size) {
@@ -915,19 +1163,15 @@ static int sizeToSubRegIndex(unsigned Size) {
}
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned SrcReg = I.getOperand(1).getReg();
- const LLT DstTy = MRI.getType(DstReg);
- const LLT SrcTy = MRI.getType(SrcReg);
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+ const LLT DstTy = MRI->getType(DstReg);
+ const LLT SrcTy = MRI->getType(SrcReg);
if (!DstTy.isScalar())
return false;
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
- const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
if (SrcRB != DstRB)
return false;
@@ -935,9 +1179,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
unsigned SrcSize = SrcTy.getSizeInBits();
const TargetRegisterClass *SrcRC
- = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI);
+ = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
const TargetRegisterClass *DstRC
- = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI);
+ = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
if (SrcSize > 32) {
int SubRegIdx = sizeToSubRegIndex(DstSize);
@@ -953,8 +1197,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
I.getOperand(1).setSubReg(SubRegIdx);
}
- if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
- !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
return false;
}
@@ -974,20 +1218,18 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock &MBB = *I.getParent();
- MachineFunction &MF = *MBB.getParent();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
- const LLT DstTy = MRI.getType(DstReg);
- const LLT SrcTy = MRI.getType(SrcReg);
+ const LLT DstTy = MRI->getType(DstReg);
+ const LLT SrcTy = MRI->getType(SrcReg);
const LLT S1 = LLT::scalar(1);
const unsigned SrcSize = SrcTy.getSizeInBits();
const unsigned DstSize = DstTy.getSizeInBits();
if (!DstTy.isScalar())
return false;
- const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);
+ const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
if (SrcBank->getID() == AMDGPU::SCCRegBankID) {
if (SrcTy != S1 || DstSize > 64) // Invalid
@@ -1000,7 +1242,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
// FIXME: Create an extra copy to avoid incorrectly constraining the result
// of the scc producer.
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg)
.addReg(SrcReg);
BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
@@ -1010,7 +1252,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
.addImm(0)
.addImm(Signed ? -1 : 1);
- return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
}
if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) {
@@ -1024,6 +1267,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addImm(0) // src1_modifiers
.addImm(Signed ? -1 : 1) // src1
.addUse(SrcReg);
+ I.eraseFromParent();
return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
}
@@ -1040,6 +1284,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
.addImm(Mask)
.addReg(SrcReg);
+ I.eraseFromParent();
return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
}
@@ -1049,11 +1294,12 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addReg(SrcReg)
.addImm(0) // Offset
.addImm(SrcSize); // Width
+ I.eraseFromParent();
return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
}
if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
- if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI))
+ if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
return false;
if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
@@ -1061,7 +1307,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
.addReg(SrcReg);
- return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}
const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
@@ -1070,10 +1317,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
// Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
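+  // e.g. extending an 8-bit source uses an S1 immediate of (8 << 16):
+  // width 8, offset 0.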
if (DstSize > 32 && SrcSize <= 32) {
// We need a 64-bit register source, but the high bits don't matter.
- unsigned ExtReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned UndefReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
.addReg(SrcReg)
@@ -1085,7 +1330,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addReg(ExtReg)
.addImm(SrcSize << 16);
- return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
}
unsigned Mask;
@@ -1099,16 +1345,58 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addImm(SrcSize << 16);
}
- return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}
return false;
}
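+
+// Bit pattern of 1.0 (unsigned source) or -1.0 (signed source, since a true
+// i1 sign-extends to -1) at the requested floating-point width.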
+static int64_t getFPTrueImmVal(unsigned Size, bool Signed) {
+ switch (Size) {
+ case 16:
+ return Signed ? 0xBC00 : 0x3C00;
+ case 32:
+ return Signed ? 0xbf800000 : 0x3f800000;
+ case 64:
+ return Signed ? 0xbff0000000000000 : 0x3ff0000000000000;
+ default:
+ llvm_unreachable("Invalid FP type size");
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_SITOFP_UITOFP(MachineInstr &I) const {
+ MachineBasicBlock *MBB = I.getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ Register Src = I.getOperand(1).getReg();
+ if (!isSCC(Src, MRI))
+ return selectImpl(I, *CoverageInfo);
+
+ bool Signed = I.getOpcode() == AMDGPU::G_SITOFP;
+ Register DstReg = I.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ const unsigned DstSize = DstTy.getSizeInBits();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(Src);
+
+ unsigned NewOpc =
+ DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+ auto MIB = BuildMI(*MBB, I, DL, TII.get(NewOpc), DstReg)
+ .addImm(0)
+ .addImm(getFPTrueImmVal(DstSize, Signed));
+
+ if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineOperand &ImmOp = I.getOperand(1);
// The AMDGPU backend only supports Imm operands and not CImm or FPImm.
@@ -1119,15 +1407,15 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
}
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
unsigned Size;
bool IsSgpr;
- const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg());
+ const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
if (RB) {
IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
- Size = MRI.getType(DstReg).getSizeInBits();
+ Size = MRI->getType(DstReg).getSizeInBits();
} else {
- const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg);
+ const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
IsSgpr = TRI.isSGPRClass(RC);
Size = TRI.getRegSizeInBits(*RC);
}
@@ -1142,34 +1430,41 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- DebugLoc DL = I.getDebugLoc();
- const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass :
- &AMDGPU::VGPR_32RegClass;
- unsigned LoReg = MRI.createVirtualRegister(RC);
- unsigned HiReg = MRI.createVirtualRegister(RC);
- const APInt &Imm = APInt(Size, I.getOperand(1).getImm());
-
- BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
- .addImm(Imm.trunc(32).getZExtValue());
+ const DebugLoc &DL = I.getDebugLoc();
- BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
- .addImm(Imm.ashr(32).getZExtValue());
+ APInt Imm(Size, I.getOperand(1).getImm());
- const MachineInstr *RS =
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(LoReg)
- .addImm(AMDGPU::sub0)
- .addReg(HiReg)
- .addImm(AMDGPU::sub1);
+ MachineInstr *ResInst;
+ if (IsSgpr && TII.isInlineConstant(Imm)) {
+ ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
+ .addImm(I.getOperand(1).getImm());
+ } else {
+ const TargetRegisterClass *RC = IsSgpr ?
+ &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
+ Register LoReg = MRI->createVirtualRegister(RC);
+ Register HiReg = MRI->createVirtualRegister(RC);
+
+ BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
+ .addImm(Imm.trunc(32).getZExtValue());
+
+ BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
+ .addImm(Imm.ashr(32).getZExtValue());
+
+ ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+ }
// We can't call constrainSelectedInstRegOperands here, because it doesn't
// work for target independent opcodes
I.eraseFromParent();
const TargetRegisterClass *DstRC =
- TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI);
+ TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
if (!DstRC)
return true;
- return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
+ return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
}
static bool isConstant(const MachineInstr &MI) {
@@ -1188,13 +1483,13 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
GEPInfo GEPInfo(*PtrMI);
- for (unsigned i = 1, e = 3; i < e; ++i) {
+ for (unsigned i = 1; i != 3; ++i) {
const MachineOperand &GEPOp = PtrMI->getOperand(i);
const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
assert(OpDef);
- if (isConstant(*OpDef)) {
- // FIXME: Is it possible to have multiple Imm parts? Maybe if we
- // are lacking other optimizations.
+ if (i == 2 && isConstant(*OpDef)) {
+ // TODO: Could handle constant base + variable offset, but a combine
+ // probably should have commuted it.
assert(GEPInfo.Imm == 0);
GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
continue;
@@ -1240,16 +1535,26 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
return false;
}
-bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
- // TODO: Can/should we insert m0 initialization here for DS instructions and
- // call the normal selector?
- return false;
+void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+
+ const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
+ unsigned AS = PtrTy.getAddressSpace();
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
+ STI.ldsRequiresM0Init()) {
+    // If DS instructions require M0 initialization, insert it before selecting.
+ BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addImm(-1);
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
+ initM0(I);
+ return selectImpl(I, *CoverageInfo);
}
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineOperand &CondOp = I.getOperand(0);
Register CondReg = CondOp.getReg();
const DebugLoc &DL = I.getDebugLoc();
@@ -1263,11 +1568,12 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
// GlobalISel, we should push that decision into RegBankSelect. Assume for now
// RegBankSelect knows what it's doing if the branch condition is scc, even
// though it currently does not.
- if (isSCC(CondReg, MRI)) {
+ if (isSCC(CondReg, *MRI)) {
CondPhysReg = AMDGPU::SCC;
BrOpcode = AMDGPU::S_CBRANCH_SCC1;
- ConstrainRC = &AMDGPU::SReg_32_XM0RegClass;
- } else if (isVCC(CondReg, MRI)) {
+ // FIXME: Hack for isSCC tests
+ ConstrainRC = &AMDGPU::SGPR_32RegClass;
+ } else if (isVCC(CondReg, *MRI)) {
// FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
// We sort of know that a VCC producer based on the register bank, that ands
// inactive lanes with 0. What if there was a logical operation with vcc
@@ -1279,8 +1585,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
} else
return false;
- if (!MRI.getRegClassOrNull(CondReg))
- MRI.setRegClass(CondReg, ConstrainRC);
+ if (!MRI->getRegClassOrNull(CondReg))
+ MRI->setRegClass(CondReg, ConstrainRC);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
.addReg(CondReg);
@@ -1292,27 +1598,83 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
}
bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
Register DstReg = I.getOperand(0).getReg();
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
if (IsVGPR)
I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
return RBI.constrainGenericRegister(
- DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI);
+ DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
+}
+
+bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
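+  // Operand 2 is the number of low pointer bits to clear, so e.g. an
+  // immediate of 4 produces Mask = ~0xf.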
+ uint64_t Align = I.getOperand(2).getImm();
+ const uint64_t Mask = ~((UINT64_C(1) << Align) - 1);
+
+ MachineBasicBlock *BB = I.getParent();
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
+ const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
+ unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ const TargetRegisterClass &RegRC
+ = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+
+ LLT Ty = MRI->getType(DstReg);
+
+ const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
+ *MRI);
+ const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
+ *MRI);
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
+ !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
+ return false;
+
+ const DebugLoc &DL = I.getDebugLoc();
+ Register ImmReg = MRI->createVirtualRegister(&RegRC);
+ BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg)
+ .addImm(Mask);
+
+ if (Ty.getSizeInBits() == 32) {
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
+ .addReg(SrcReg)
+ .addReg(ImmReg);
+ I.eraseFromParent();
+ return true;
+ }
+
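+  // Only the low 32 bits need the AND; the mask's upper bits are all ones for
+  // any sensible alignment, so the high half is copied through unchanged.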
+ Register HiReg = MRI->createVirtualRegister(&RegRC);
+ Register LoReg = MRI->createVirtualRegister(&RegRC);
+ Register MaskLo = MRI->createVirtualRegister(&RegRC);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(SrcReg, 0, AMDGPU::sub0);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(SrcReg, 0, AMDGPU::sub1);
+
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo)
+ .addReg(LoReg)
+ .addReg(ImmReg);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(MaskLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+ I.eraseFromParent();
+ return true;
}
-bool AMDGPUInstructionSelector::select(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
- if (!isPreISelGenericOpcode(I.getOpcode())) {
+ if (!I.isPreISelOpcode()) {
if (I.isCopy())
return selectCOPY(I);
return true;
@@ -1324,16 +1686,18 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_XOR:
if (selectG_AND_OR_XOR(I))
return true;
- return selectImpl(I, CoverageInfo);
+ return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
- if (selectG_ADD_SUB(I))
+ if (selectImpl(I, *CoverageInfo))
return true;
- LLVM_FALLTHROUGH;
- default:
- return selectImpl(I, CoverageInfo);
+ return selectG_ADD_SUB(I);
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_USUBO:
+ return selectG_UADDO_USUBO(I);
case TargetOpcode::G_INTTOPTR:
case TargetOpcode::G_BITCAST:
+ case TargetOpcode::G_PTRTOINT:
return selectCOPY(I);
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
@@ -1353,32 +1717,40 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_INSERT:
return selectG_INSERT(I);
case TargetOpcode::G_INTRINSIC:
- return selectG_INTRINSIC(I, CoverageInfo);
+ return selectG_INTRINSIC(I);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
- return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
+ return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
case TargetOpcode::G_ICMP:
if (selectG_ICMP(I))
return true;
- return selectImpl(I, CoverageInfo);
+ return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_LOAD:
- return selectImpl(I, CoverageInfo);
+ case TargetOpcode::G_ATOMIC_CMPXCHG:
+ case TargetOpcode::G_ATOMICRMW_XCHG:
+ case TargetOpcode::G_ATOMICRMW_ADD:
+ case TargetOpcode::G_ATOMICRMW_SUB:
+ case TargetOpcode::G_ATOMICRMW_AND:
+ case TargetOpcode::G_ATOMICRMW_OR:
+ case TargetOpcode::G_ATOMICRMW_XOR:
+ case TargetOpcode::G_ATOMICRMW_MIN:
+ case TargetOpcode::G_ATOMICRMW_MAX:
+ case TargetOpcode::G_ATOMICRMW_UMIN:
+ case TargetOpcode::G_ATOMICRMW_UMAX:
+ case TargetOpcode::G_ATOMICRMW_FADD:
+ return selectG_LOAD_ATOMICRMW(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
case TargetOpcode::G_STORE:
- if (selectImpl(I, CoverageInfo))
- return true;
return selectG_STORE(I);
case TargetOpcode::G_TRUNC:
return selectG_TRUNC(I);
case TargetOpcode::G_SEXT:
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_ANYEXT:
- if (selectG_SZA_EXT(I)) {
- I.eraseFromParent();
- return true;
- }
-
- return false;
+ return selectG_SZA_EXT(I);
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP:
+ return selectG_SITOFP_UITOFP(I);
case TargetOpcode::G_BRCOND:
return selectG_BRCOND(I);
case TargetOpcode::G_FRAME_INDEX:
@@ -1388,6 +1760,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
// is checking for G_CONSTANT
I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE));
return true;
+ case TargetOpcode::G_PTR_MASK:
+ return selectG_PTR_MASK(I);
+ default:
+ return selectImpl(I, *CoverageInfo);
}
return false;
}
@@ -1402,14 +1778,14 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
- Register Src, const MachineRegisterInfo &MRI) const {
+ Register Src) const {
unsigned Mods = 0;
- MachineInstr *MI = MRI.getVRegDef(Src);
+ MachineInstr *MI = MRI->getVRegDef(Src);
if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
Src = MI->getOperand(1).getReg();
Mods |= SISrcMods::NEG;
- MI = MRI.getVRegDef(Src);
+ MI = MRI->getVRegDef(Src);
}
if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
@@ -1432,12 +1808,23 @@ AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
- MachineRegisterInfo &MRI
- = Root.getParent()->getParent()->getParent()->getRegInfo();
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1446,6 +1833,7 @@ AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
}};
}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
return {{
@@ -1457,12 +1845,9 @@ AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
- MachineRegisterInfo &MRI
- = Root.getParent()->getParent()->getParent()->getRegInfo();
-
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1471,12 +1856,28 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
- MachineRegisterInfo &MRI =
- Root.getParent()->getParent()->getParent()->getRegInfo();
+AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
+ // FIXME: Handle clamp and op_sel
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
+ // FIXME: Handle op_sel
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+ getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
return None;
@@ -1496,11 +1897,8 @@ AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
- MachineRegisterInfo &MRI =
- Root.getParent()->getParent()->getParent()->getRegInfo();
-
SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+ getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
return None;
@@ -1521,10 +1919,9 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*MI, MRI, AddrInfo);
+ getAddrModeInfo(*MI, *MRI, AddrInfo);
// FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
// then we can select all ptr + 32-bit offsets not just immediate offsets.
@@ -1540,7 +1937,7 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
// failed trying to select this load into one of the _IMM variants since
// the _IMM Patterns are considered before the _SGPR patterns.
unsigned PtrReg = GEPInfo.SgprParts[0];
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(GEPInfo.Imm);
return {{
@@ -1553,8 +1950,6 @@ template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
InstructionSelector::ComplexRendererFns Default = {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
@@ -1565,12 +1960,12 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
if (!STI.hasFlatInstOffsets())
return Default;
- const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg());
+ const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP)
return Default;
Optional<int64_t> Offset =
- getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI);
+ getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
if (!Offset.hasValue())
return Default;
@@ -1597,12 +1992,6 @@ AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
return selectFlatOffsetImpl<true>(Root);
}
-// FIXME: Implement
-static bool signBitIsZero(const MachineOperand &Op,
- const MachineRegisterInfo &MRI) {
- return false;
-}
-
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
return PSV && PSV->isStack();
@@ -1613,12 +2002,11 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
int64_t Offset = 0;
- if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) {
- Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
+ Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// TODO: Should this be inside the render function? The iterator seems to
// move.
@@ -1652,18 +2040,18 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
// offsets.
Optional<int> FI;
Register VAddr = Root.getReg();
- if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) {
- if (isBaseWithConstantOffset(Root, MRI)) {
+ if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
+ if (isBaseWithConstantOffset(Root, *MRI)) {
const MachineOperand &LHS = RootDef->getOperand(1);
const MachineOperand &RHS = RootDef->getOperand(2);
- const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
- const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
+ const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
+ const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
if (LHSDef && RHSDef) {
int64_t PossibleOffset =
RHSDef->getOperand(1).getCImm()->getSExtValue();
if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
(!STI.privateMemoryResourceIsRangeChecked() ||
- signBitIsZero(LHS, MRI))) {
+ KnownBits->signBitIsZero(LHS.getReg()))) {
if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
FI = LHSDef->getOperand(1).getIndex();
else
@@ -1700,15 +2088,30 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
}}};
}
+bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
+ const MachineOperand &Base,
+ int64_t Offset,
+ unsigned OffsetBits) const {
+ if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
+ (OffsetBits == 8 && !isUInt<8>(Offset)))
+ return false;
+
+ if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
+ return true;
+
+  // On Southern Islands, instructions with a negative base value and an
+  // offset don't seem to work.
+ return KnownBits->signBitIsZero(Base.getReg());
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
int64_t Offset = 0;
- if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) ||
+ if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
return {};
@@ -1728,3 +2131,54 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
+ const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
+ if (!RootDef) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
+ }};
+ }
+
+ int64_t ConstAddr = 0;
+ if (isBaseWithConstantOffset(Root, *MRI)) {
+ const MachineOperand &LHS = RootDef->getOperand(1);
+ const MachineOperand &RHS = RootDef->getOperand(2);
+ const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
+ const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
+ if (LHSDef && RHSDef) {
+ int64_t PossibleOffset =
+ RHSDef->getOperand(1).getCImm()->getSExtValue();
+ if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) {
+ // (add n0, c0)
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); }
+ }};
+ }
+ }
+  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
+    // TODO: Handle the (sub n0, c0) case.
+  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+    // TODO: Handle a purely constant address.
+  }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
+ }};
+}
+
+void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
+ const MachineInstr &MI) const {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI);
+ assert(CstVal && "Expected constant value");
+ MIB.addImm(CstVal.getValue());
+}
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 4f489ddfb23d..d3c83a6a872a 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -35,6 +35,7 @@ class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
class GCNSubtarget;
class MachineInstr;
+class MachineIRBuilder;
class MachineOperand;
class MachineRegisterInfo;
class SIInstrInfo;
@@ -42,14 +43,20 @@ class SIMachineFunctionInfo;
class SIRegisterInfo;
class AMDGPUInstructionSelector : public InstructionSelector {
+private:
+ MachineRegisterInfo *MRI;
+
public:
AMDGPUInstructionSelector(const GCNSubtarget &STI,
const AMDGPURegisterBankInfo &RBI,
const AMDGPUTargetMachine &TM);
- bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ bool select(MachineInstr &I) override;
static const char *getName();
+ void setupMF(MachineFunction &MF, GISelKnownBits &KB,
+ CodeGenCoverage &CoverageInfo) override;
+
private:
struct GEPInfo {
const MachineInstr &GEP;
@@ -72,32 +79,42 @@ private:
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
+ bool selectG_SITOFP_UITOFP(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
bool selectG_AND_OR_XOR(MachineInstr &I) const;
bool selectG_ADD_SUB(MachineInstr &I) const;
+ bool selectG_UADDO_USUBO(MachineInstr &I) const;
bool selectG_EXTRACT(MachineInstr &I) const;
bool selectG_MERGE_VALUES(MachineInstr &I) const;
bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
bool selectG_GEP(MachineInstr &I) const;
bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
bool selectG_INSERT(MachineInstr &I) const;
- bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
- bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const;
+ bool selectG_INTRINSIC(MachineInstr &I) const;
+
+ std::tuple<Register, unsigned, unsigned>
+ splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
+
+ bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const;
+
+ bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const;
int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
bool selectG_ICMP(MachineInstr &I) const;
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const;
- bool selectG_LOAD(MachineInstr &I) const;
- bool selectG_SELECT(MachineInstr &I) const;
+
+ void initM0(MachineInstr &I) const;
+ bool selectG_LOAD_ATOMICRMW(MachineInstr &I) const;
bool selectG_STORE(MachineInstr &I) const;
+ bool selectG_SELECT(MachineInstr &I) const;
bool selectG_BRCOND(MachineInstr &I) const;
bool selectG_FRAME_INDEX(MachineInstr &I) const;
+ bool selectG_PTR_MASK(MachineInstr &I) const;
std::pair<Register, unsigned>
- selectVOP3ModsImpl(Register Src, const MachineRegisterInfo &MRI) const;
+ selectVOP3ModsImpl(Register Src) const;
InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;
@@ -108,11 +125,18 @@ private:
InstructionSelector::ComplexRendererFns
selectVOP3Mods0(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectVOP3OMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVOP3Mods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectVOP3OpSelMods0(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3OpSelMods(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm32(MachineOperand &Root) const;
@@ -133,6 +157,16 @@ private:
InstructionSelector::ComplexRendererFns
selectMUBUFScratchOffset(MachineOperand &Root) const;
+ bool isDSOffsetLegal(const MachineRegisterInfo &MRI,
+ const MachineOperand &Base,
+ int64_t Offset, unsigned OffsetBits) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectDS1Addr1Offset(MachineOperand &Root) const;
+
+ void renderTruncImm32(MachineInstrBuilder &MIB,
+ const MachineInstr &MI) const;
+
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const AMDGPURegisterBankInfo &RBI;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 61bc415c839d..846e7f577a28 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -75,7 +75,7 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
let isCodeGenOnly = 1;
}
-def TruePredicate : Predicate<"true">;
+def TruePredicate : Predicate<"">;
class PredicateControl {
Predicate SubtargetPredicate = TruePredicate;
@@ -220,80 +220,48 @@ def hi_f16_elt : PatLeaf<
// PatLeafs for floating-point comparisons
//===----------------------------------------------------------------------===//
-def COND_OEQ : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
->;
-
-def COND_ONE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}]
->;
-
-def COND_OGT : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
->;
-
-def COND_OGE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}]
->;
-
-def COND_OLT : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}]
->;
-
-def COND_OLE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
->;
-
-def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
-def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
+def COND_OEQ : PatFrags<(ops), [(OtherVT SETOEQ), (OtherVT SETEQ)]>;
+def COND_ONE : PatFrags<(ops), [(OtherVT SETONE), (OtherVT SETNE)]>;
+def COND_OGT : PatFrags<(ops), [(OtherVT SETOGT), (OtherVT SETGT)]>;
+def COND_OGE : PatFrags<(ops), [(OtherVT SETOGE), (OtherVT SETGE)]>;
+def COND_OLT : PatFrags<(ops), [(OtherVT SETOLT), (OtherVT SETLT)]>;
+def COND_OLE : PatFrags<(ops), [(OtherVT SETOLE), (OtherVT SETLE)]>;
+def COND_O : PatFrags<(ops), [(OtherVT SETO)]>;
+def COND_UO : PatFrags<(ops), [(OtherVT SETUO)]>;
//===----------------------------------------------------------------------===//
// PatLeafs for unsigned / unordered comparisons
//===----------------------------------------------------------------------===//
-def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>;
-def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>;
-def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
-def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
-def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
-def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+def COND_UEQ : PatFrag<(ops), (OtherVT SETUEQ)>;
+def COND_UNE : PatFrag<(ops), (OtherVT SETUNE)>;
+def COND_UGT : PatFrag<(ops), (OtherVT SETUGT)>;
+def COND_UGE : PatFrag<(ops), (OtherVT SETUGE)>;
+def COND_ULT : PatFrag<(ops), (OtherVT SETULT)>;
+def COND_ULE : PatFrag<(ops), (OtherVT SETULE)>;
// XXX - For some reason R600 version is preferring to use unordered
// for setne?
-def COND_UNE_NE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
->;
+def COND_UNE_NE : PatFrags<(ops), [(OtherVT SETUNE), (OtherVT SETNE)]>;
//===----------------------------------------------------------------------===//
// PatLeafs for signed comparisons
//===----------------------------------------------------------------------===//
-def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>;
-def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>;
-def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>;
-def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>;
+def COND_SGT : PatFrag<(ops), (OtherVT SETGT)>;
+def COND_SGE : PatFrag<(ops), (OtherVT SETGE)>;
+def COND_SLT : PatFrag<(ops), (OtherVT SETLT)>;
+def COND_SLE : PatFrag<(ops), (OtherVT SETLE)>;
//===----------------------------------------------------------------------===//
// PatLeafs for integer equality
//===----------------------------------------------------------------------===//
-def COND_EQ : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}]
->;
-
-def COND_NE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}]
->;
+def COND_EQ : PatFrags<(ops), [(OtherVT SETEQ), (OtherVT SETUEQ)]>;
+def COND_NE : PatFrags<(ops), [(OtherVT SETNE), (OtherVT SETUNE)]>;
+// FIXME: Should not need code predicate
+//def COND_NULL : PatLeaf<(OtherVT null_frag)>;
def COND_NULL : PatLeaf <
(cond),
[{(void)N; return false;}]
@@ -335,17 +303,17 @@ def TEX_SHADOW_ARRAY : PatLeaf<
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
+def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
class AddressSpaceList<list<int> AS> {
list<int> AddrSpaces = AS;
}
-class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
-}]>;
-
-class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAlignment() >= 16;
-}]>;
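+// Mix-in providing the MinAlignment value consumed by the PatFrag predicate
+// machinery, in place of the manual getAlignment() checks above.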
+class Aligned<int Bytes> {
+ int MinAlignment = Bytes;
+}
class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>;
@@ -502,6 +470,35 @@ defm atomic_store_#as : binary_atomic_op<atomic_store>;
} // End foreach AddrSpace
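+
+// For each address space, defines the atomic PatFrags plus "_ret" and
+// "_noret" variants that match only when the atomic's result value is used
+// or unused, respectively.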
+multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
+ foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+ let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
+ defm "_"#as : binary_atomic_op<atomic_op, IsInt>;
+
+ let PredicateCode = [{return (SDValue(N, 0).use_empty());}] in {
+ defm "_"#as#"_noret" : binary_atomic_op<atomic_op, IsInt>;
+ }
+
+ let PredicateCode = [{return !(SDValue(N, 0).use_empty());}] in {
+ defm "_"#as#"_ret" : binary_atomic_op<atomic_op, IsInt>;
+ }
+ }
+ }
+}
+
+defm atomic_swap : ret_noret_binary_atomic_op<atomic_swap>;
+defm atomic_load_add : ret_noret_binary_atomic_op<atomic_load_add>;
+defm atomic_load_and : ret_noret_binary_atomic_op<atomic_load_and>;
+defm atomic_load_max : ret_noret_binary_atomic_op<atomic_load_max>;
+defm atomic_load_min : ret_noret_binary_atomic_op<atomic_load_min>;
+defm atomic_load_or : ret_noret_binary_atomic_op<atomic_load_or>;
+defm atomic_load_sub : ret_noret_binary_atomic_op<atomic_load_sub>;
+defm atomic_load_umax : ret_noret_binary_atomic_op<atomic_load_umax>;
+defm atomic_load_umin : ret_noret_binary_atomic_op<atomic_load_umin>;
+defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>;
+defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
+
+
def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress;
def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress;
@@ -513,21 +510,31 @@ def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress;
def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress;
def atomic_store_local : LocalStore <atomic_store>;
-def load_align8_local : Aligned8Bytes <
- (ops node:$ptr), (load_local node:$ptr)
->;
-def load_align16_local : Aligned16Bytes <
- (ops node:$ptr), (load_local node:$ptr)
->;
+def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+ let MinAlignment = 8;
+}
-def store_align8_local : Aligned8Bytes <
- (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
->;
+def load_align16_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+ let MinAlignment = 16;
+}
+
+def store_align8_local: PatFrag<(ops node:$val, node:$ptr),
+ (store_local node:$val, node:$ptr)>, Aligned<8> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
+
+def store_align16_local: PatFrag<(ops node:$val, node:$ptr),
+ (store_local node:$val, node:$ptr)>, Aligned<16> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
-def store_align16_local : Aligned16Bytes <
- (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
->;
def atomic_store_flat : FlatStore <atomic_store>;
def truncstorei8_hi16_flat : StoreHi16<truncstorei8>, FlatStoreAddress;
@@ -547,69 +554,26 @@ class region_binary_atomic_op<SDNode atomic_op> :
}]>;
-def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
-def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
-def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
-def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>;
-def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>;
-def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>;
-def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>;
-def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>;
-def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>;
-def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>;
-def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
-
def mskor_global : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
-class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag<
- (ops node:$ptr, node:$cmp, node:$swap),
- (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
- AtomicSDNode *AN = cast<AtomicSDNode>(N);
- return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
-
-class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag<
- (ops node:$ptr, node:$cmp, node:$swap),
- (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
- AtomicSDNode *AN = cast<AtomicSDNode>(N);
- return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
-}]>;
+let AddressSpaces = StoreAddress_local.AddrSpaces in {
+defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
+}
-def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>;
+let AddressSpaces = StoreAddress_region.AddrSpaces in {
+defm atomic_cmp_swap_region : ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
+}
class global_binary_atomic_op_frag<SDNode atomic_op> : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
[{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
-multiclass global_binary_atomic_op<SDNode atomic_op> {
- def "" : global_binary_atomic_op_frag<atomic_op>;
-
- def _noret : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
-
- def _ret : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
-}
-
-defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
-defm atomic_add_global : global_binary_atomic_op<atomic_load_add>;
-defm atomic_and_global : global_binary_atomic_op<atomic_load_and>;
-defm atomic_max_global : global_binary_atomic_op<atomic_load_max>;
-defm atomic_min_global : global_binary_atomic_op<atomic_load_min>;
-defm atomic_or_global : global_binary_atomic_op<atomic_load_or>;
-defm atomic_sub_global : global_binary_atomic_op<atomic_load_sub>;
-defm atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
-defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
-defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
-
// Legacy.
def AMDGPUatomic_cmp_swap_global : PatFrag<
(ops node:$ptr, node:$value),
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 670f6225fbf7..5aba35a19ced 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -11,6 +11,13 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
+#if defined(_MSC_VER) || defined(__MINGW32__)
+// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
+// from the Visual C++ cmath / math.h headers:
+// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
+#define _USE_MATH_DEFINES
+#endif
+
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
@@ -20,6 +27,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
@@ -32,7 +40,7 @@ using namespace LegalityPredicates;
static LegalityPredicate isMultiple32(unsigned TypeIdx,
- unsigned MaxSize = 512) {
+ unsigned MaxSize = 1024) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
const LLT EltTy = Ty.getScalarType();
@@ -40,12 +48,27 @@ static LegalityPredicate isMultiple32(unsigned TypeIdx,
};
}
+static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].getSizeInBits() == Size;
+ };
+}
+
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
return Ty.isVector() &&
Ty.getNumElements() % 2 != 0 &&
- Ty.getElementType().getSizeInBits() < 32;
+ Ty.getElementType().getSizeInBits() < 32 &&
+ Ty.getSizeInBits() % 32 != 0;
+ };
+}
+
+static LegalityPredicate isWideVec16(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const LLT EltTy = Ty.getScalarType();
+ return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
};
}
@@ -68,6 +91,31 @@ static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
};
}
+// Increase the number of vector elements to reach the next multiple of 32-bit
+// type.
+static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+
+ const LLT EltTy = Ty.getElementType();
+ const int Size = Ty.getSizeInBits();
+ const int EltSize = EltTy.getSizeInBits();
+ const int NextMul32 = (Size + 31) / 32;
+
+ assert(EltSize < 32);
+
+ const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
+ return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
+ };
+}
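// A minimal illustrative sketch, not part of the patch: the padding rule in
// moreEltsToNext32Bit above, worked for v3s16. The vector is 48 bits wide,
// so NextMul32 = ceil(48/32) = 2 and the padded width is 64 bits;
// NewNumElts = ceil(64/16) = 4, i.e. v3s16 is widened to v4s16. The helper
// name below is hypothetical.
#include <utility>
static std::pair<int, int> paddedVectorShape(int NumElts, int EltSize) {
  const int Size = NumElts * EltSize;
  const int NextMul32 = (Size + 31) / 32;                 // 32-bit units, rounded up
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
  return {32 * NextMul32, NewNumElts};                    // {64, 4} for v3s16
}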
+
+static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
+ };
+}
+
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
@@ -82,7 +130,7 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
};
}
-// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
+// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
@@ -94,7 +142,21 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) {
EltSize == 128 || EltSize == 256;
}
- return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
+ return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
+ };
+}
+
+static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].getElementType() == Type;
+ };
+}
+
+static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
+ Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
};
}
@@ -112,9 +174,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
+ const LLT S96 = LLT::scalar(96);
const LLT S128 = LLT::scalar(128);
const LLT S256 = LLT::scalar(256);
- const LLT S512 = LLT::scalar(512);
+ const LLT S1024 = LLT::scalar(1024);
const LLT V2S16 = LLT::vector(2, 16);
const LLT V4S16 = LLT::vector(4, 16);
@@ -134,6 +197,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT V14S32 = LLT::vector(14, 32);
const LLT V15S32 = LLT::vector(15, 32);
const LLT V16S32 = LLT::vector(16, 32);
+ const LLT V32S32 = LLT::vector(32, 32);
const LLT V2S64 = LLT::vector(2, 64);
const LLT V3S64 = LLT::vector(3, 64);
@@ -142,16 +206,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT V6S64 = LLT::vector(6, 64);
const LLT V7S64 = LLT::vector(7, 64);
const LLT V8S64 = LLT::vector(8, 64);
+ const LLT V16S64 = LLT::vector(16, 64);
std::initializer_list<LLT> AllS32Vectors =
{V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
- V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
+ V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
std::initializer_list<LLT> AllS64Vectors =
- {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
+ {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
+ const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
+ const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
@@ -162,7 +229,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
const std::initializer_list<LLT> AddrSpaces32 = {
- LocalPtr, PrivatePtr
+ LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
};
const std::initializer_list<LLT> FPTypesBase = {
@@ -216,37 +283,34 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
.clampScalar(0, S32, S64)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
+ .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
.widenScalarToNextPow2(0)
.scalarize(0);
- getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
+ getActionDefinitionsBuilder({G_UADDO, G_USUBO,
G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
.legalFor({{S32, S1}})
- .clampScalar(0, S32, S32);
+ .clampScalar(0, S32, S32)
+ .scalarize(0); // TODO: Implement.
+
+ getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
+ .lower();
getActionDefinitionsBuilder(G_BITCAST)
- .legalForCartesianProduct({S32, V2S16})
- .legalForCartesianProduct({S64, V2S32, V4S16})
- .legalForCartesianProduct({V2S64, V4S32})
// Don't worry about the size constraint.
- .legalIf(all(isPointer(0), isPointer(1)));
+ .legalIf(all(isRegisterType(0), isRegisterType(1)))
+ // FIXME: Testing hack
+ .legalForCartesianProduct({S16, LLT::vector(2, 8), });
- if (ST.has16BitInsts()) {
- getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64, S16})
- .clampScalar(0, S16, S64);
- } else {
- getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64})
- .clampScalar(0, S32, S64);
- }
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64, S16})
+ .clampScalar(0, S16, S64);
getActionDefinitionsBuilder(G_IMPLICIT_DEF)
- .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
+ .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .clampScalarOrElt(0, S32, S512)
+ .clampScalarOrElt(0, S32, S1024)
.legalIf(isMultiple32(0))
.widenScalarToNextPow2(0, 32)
.clampMaxNumElements(0, S32, 16);
@@ -256,23 +320,33 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// values may not be legal. We need to figure out how to distinguish
// between these two scenarios.
getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({S1, S32, S64, GlobalPtr,
+ .legalFor({S1, S32, S64, S16, GlobalPtr,
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(0)
.legalIf(isPointer(0));
setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE)
+ .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
+
auto &FPOpActions = getActionDefinitionsBuilder(
- { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
+ { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
.legalFor({S32, S64});
+ auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
+ .customFor({S32, S64});
+ auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
+ .customFor({S32, S64});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts())
FPOpActions.legalFor({S16, V2S16});
else
FPOpActions.legalFor({S16});
+
+ TrigActions.customFor({S16});
+ FDIVActions.customFor({S16});
}
auto &MinNumMaxNum = getActionDefinitionsBuilder({
@@ -293,22 +367,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
}
- // TODO: Implement
- getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
-
if (ST.hasVOP3PInsts())
FPOpActions.clampMaxNumElements(0, S16, 2);
+
FPOpActions
.scalarize(0)
.clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+ TrigActions
+ .scalarize(0)
+ .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+
+ FDIVActions
+ .scalarize(0)
+ .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+
+ getActionDefinitionsBuilder({G_FNEG, G_FABS})
+ .legalFor(FPTypesPK16)
+ .clampMaxNumElements(0, S16, 2)
+ .scalarize(0)
+ .clampScalar(0, S16, S64);
+
+ // TODO: Implement
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+
if (ST.has16BitInsts()) {
- getActionDefinitionsBuilder(G_FSQRT)
+ getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
.legalFor({S32, S64, S16})
.scalarize(0)
.clampScalar(0, S16, S64);
} else {
- getActionDefinitionsBuilder(G_FSQRT)
+ getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
.legalFor({S32, S64})
.scalarize(0)
.clampScalar(0, S32, S64);
@@ -334,23 +423,43 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.clampScalar(0, S32, S64);
+ // Whether this is legal depends on the floating point mode for the function.
+ auto &FMad = getActionDefinitionsBuilder(G_FMAD);
+ if (ST.hasMadF16())
+ FMad.customFor({S32, S16});
+ else
+ FMad.customFor({S32});
+ FMad.scalarize(0)
+ .lower();
+
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
.legalFor({{S64, S32}, {S32, S16}, {S64, S16},
{S32, S1}, {S64, S1}, {S16, S1},
+ {S96, S32},
// FIXME: Hack
{S64, LLT::scalar(33)},
{S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
.scalarize(0);
- getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
- .legalFor({{S32, S32}, {S64, S32}})
+ // TODO: Split s1->s64 during regbankselect for VALU.
+ auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
.lowerFor({{S32, S64}})
- .customFor({{S64, S64}})
- .scalarize(0);
+ .customFor({{S64, S64}});
+ if (ST.has16BitInsts())
+ IToFP.legalFor({{S16, S16}});
+ IToFP.clampScalar(1, S32, S64)
+ .scalarize(0);
+
+ auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
+ if (ST.has16BitInsts())
+ FPToI.legalFor({{S16, S16}});
+ else
+ FPToI.minScalar(1, S32);
- getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
- .legalFor({{S32, S32}, {S32, S64}})
- .scalarize(0);
+ FPToI.minScalar(0, S32)
+ .scalarize(0);
getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
.legalFor({S32, S64})
@@ -374,6 +483,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalForCartesianProduct(AddrSpaces32, {S32})
.scalarize(0);
+ getActionDefinitionsBuilder(G_PTR_MASK)
+ .scalarize(0)
+ .alwaysLegal();
+
setAction({G_BLOCK_ADDR, CodePtr}, Legal);
auto &CmpBuilder =
@@ -415,7 +528,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(1, 32);
// TODO: Expand for > s32
- getActionDefinitionsBuilder(G_BSWAP)
+ getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
.legalFor({S32})
.clampScalar(0, S32, S32)
.scalarize(0);
@@ -491,87 +604,239 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
});
- if (ST.hasFlatAddressSpace()) {
- getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
- .scalarize(0)
- .custom();
- }
+ getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
+ .scalarize(0)
+ .custom();
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
- getActionDefinitionsBuilder({G_LOAD, G_STORE})
- .narrowScalarIf([](const LegalityQuery &Query) {
- unsigned Size = Query.Types[0].getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- return (Size > 32 && MemSize < Size);
- },
- [](const LegalityQuery &Query) {
- return std::make_pair(0, LLT::scalar(32));
- })
- .fewerElementsIf([=](const LegalityQuery &Query) {
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- return (MemSize == 96) &&
- Query.Types[0].isVector() &&
- !ST.hasDwordx3LoadStores();
- },
- [=](const LegalityQuery &Query) {
- return std::make_pair(0, V2S32);
- })
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &Ty0 = Query.Types[0];
-
- unsigned Size = Ty0.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- if (Size < 32 || (Size > 32 && MemSize < Size))
- return false;
-
- if (Ty0.isVector() && Size != MemSize)
- return false;
-
- // TODO: Decompose private loads into 4-byte components.
- // TODO: Illegal flat loads on SI
- switch (MemSize) {
- case 8:
- case 16:
- return Size == 32;
- case 32:
- case 64:
- case 128:
- return true;
+ auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
+ switch (AS) {
+ // FIXME: Private element size.
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ return 32;
+ // FIXME: Check subtarget
+ case AMDGPUAS::LOCAL_ADDRESS:
+ return ST.useDS128() ? 128 : 64;
+
+ // Treat constant and global as identical. SMRD loads are sometimes usable
+ // for global loads (ideally constant address space should be eliminated)
+ // depending on the context. Legality cannot be context dependent, but
+ // RegBankSelect can split the load as necessary depending on the pointer
+ // register bank/uniformity and if the memory is invariant or not written in
+ // a kernel.
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ return 512;
+ default:
+ return 128;
+ }
+ };
- case 96:
- return ST.hasDwordx3LoadStores();
-
- case 256:
- case 512:
- // TODO: Possibly support loads of i256 and i512 . This will require
- // adding i256 and i512 types to MVT in order for to be able to use
- // TableGen.
- // TODO: Add support for other vector types, this will require
- // defining more value mappings for the new types.
- return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
- Ty0.getScalarType().getSizeInBits() == 64);
-
- default:
- return false;
- }
- })
- .clampScalar(0, S32, S64);
+ const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
+ const LLT DstTy = Query.Types[0];
+
+ // Split vector extloads.
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
+ return true;
+
+ const LLT PtrTy = Query.Types[1];
+ unsigned AS = PtrTy.getAddressSpace();
+ if (MemSize > maxSizeForAddrSpace(AS))
+ return true;
+
+ // Catch weird sized loads that don't evenly divide into the access sizes
+ // TODO: May be able to widen depending on alignment etc.
+ unsigned NumRegs = MemSize / 32;
+ if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
+ return true;
+
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ if (Align < MemSize) {
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
+ }
+
+ return false;
+ };
+ unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
+ unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
+ unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
+
+ // TODO: Refine based on subtargets which support unaligned access or 128-bit
+ // LDS
+ // TODO: Unsupported flat for SI.
+
+ for (unsigned Op : {G_LOAD, G_STORE}) {
+ const bool IsStore = Op == G_STORE;
+
+ auto &Actions = getActionDefinitionsBuilder(Op);
+ // Whitelist the common cases.
+ // TODO: Pointer loads
+ // TODO: Wide constant loads
+ // TODO: Only CI+ has 3x loads
+ // TODO: Loads to s16 on gfx9
+ Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
+ {V2S32, GlobalPtr, 64, GlobalAlign32},
+ {V3S32, GlobalPtr, 96, GlobalAlign32},
+ {S96, GlobalPtr, 96, GlobalAlign32},
+ {V4S32, GlobalPtr, 128, GlobalAlign32},
+ {S128, GlobalPtr, 128, GlobalAlign32},
+ {S64, GlobalPtr, 64, GlobalAlign32},
+ {V2S64, GlobalPtr, 128, GlobalAlign32},
+ {V2S16, GlobalPtr, 32, GlobalAlign32},
+ {S32, GlobalPtr, 8, GlobalAlign8},
+ {S32, GlobalPtr, 16, GlobalAlign16},
+
+ {S32, LocalPtr, 32, 32},
+ {S64, LocalPtr, 64, 32},
+ {V2S32, LocalPtr, 64, 32},
+ {S32, LocalPtr, 8, 8},
+ {S32, LocalPtr, 16, 16},
+ {V2S16, LocalPtr, 32, 32},
+
+ {S32, PrivatePtr, 32, 32},
+ {S32, PrivatePtr, 8, 8},
+ {S32, PrivatePtr, 16, 16},
+ {V2S16, PrivatePtr, 32, 32},
+
+ {S32, FlatPtr, 32, GlobalAlign32},
+ {S32, FlatPtr, 16, GlobalAlign16},
+ {S32, FlatPtr, 8, GlobalAlign8},
+ {V2S16, FlatPtr, 32, GlobalAlign32},
+
+ {S32, ConstantPtr, 32, GlobalAlign32},
+ {V2S32, ConstantPtr, 64, GlobalAlign32},
+ {V3S32, ConstantPtr, 96, GlobalAlign32},
+ {V4S32, ConstantPtr, 128, GlobalAlign32},
+ {S64, ConstantPtr, 64, GlobalAlign32},
+ {S128, ConstantPtr, 128, GlobalAlign32},
+ {V2S32, ConstantPtr, 32, GlobalAlign32}});
+ Actions
+ .customIf(typeIs(1, Constant32Ptr))
+ .narrowScalarIf(
+ [=](const LegalityQuery &Query) -> bool {
+ return !Query.Types[0].isVector() && needToSplitLoad(Query);
+ },
+ [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
+ const LLT DstTy = Query.Types[0];
+ const LLT PtrTy = Query.Types[1];
+
+ const unsigned DstSize = DstTy.getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+
+ // Split extloads.
+ if (DstSize > MemSize)
+ return std::make_pair(0, LLT::scalar(MemSize));
+
+ if (DstSize > 32 && (DstSize % 32 != 0)) {
+ // FIXME: Need a way to specify non-extload of larger size if
+ // suitably aligned.
+ return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
+ }
+
+ unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
+ if (MemSize > MaxSize)
+ return std::make_pair(0, LLT::scalar(MaxSize));
+
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ return std::make_pair(0, LLT::scalar(Align));
+ })
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) -> bool {
+ return Query.Types[0].isVector() && needToSplitLoad(Query);
+ },
+ [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
+ const LLT DstTy = Query.Types[0];
+ const LLT PtrTy = Query.Types[1];
+
+ LLT EltTy = DstTy.getElementType();
+ unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
+
+ // Split if it's too large for the address space.
+ if (Query.MMODescrs[0].SizeInBits > MaxSize) {
+ unsigned NumElts = DstTy.getNumElements();
+ unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
+
+ // FIXME: Refine when odd breakdowns handled
+ // The scalars will need to be re-legalized.
+ if (NumPieces == 1 || NumPieces >= NumElts ||
+ NumElts % NumPieces != 0)
+ return std::make_pair(0, EltTy);
+
+ return std::make_pair(0,
+ LLT::vector(NumElts / NumPieces, EltTy));
+ }
+
+ // Need to split because of alignment.
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned EltSize = EltTy.getSizeInBits();
+ if (EltSize > Align &&
+ (EltSize / Align < DstTy.getNumElements())) {
+ return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
+ }
+
+ // May need relegalization for the scalars.
+ return std::make_pair(0, EltTy);
+ })
+ .minScalar(0, S32);
+
+ if (IsStore)
+ Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
+
+ // TODO: Need a bitcast lower option?
+ Actions
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT Ty0 = Query.Types[0];
+ unsigned Size = Ty0.getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+
+ // No extending vector loads.
+ if (Size > MemSize && Ty0.isVector())
+ return false;
+
+ // FIXME: Widening store from alignment not valid.
+ if (MemSize < Size)
+ MemSize = std::max(MemSize, Align);
+
+ switch (MemSize) {
+ case 8:
+ case 16:
+ return Size == 32;
+ case 32:
+ case 64:
+ case 128:
+ return true;
+ case 96:
+ return ST.hasDwordx3LoadStores();
+ case 256:
+ case 512:
+ return true;
+ default:
+ return false;
+ }
+ })
+ .widenScalarToNextPow2(0)
+ // TODO: v3s32->v4s32 with alignment
+ .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
+ }
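// A minimal illustrative sketch, not part of the patch: one concrete walk
// through the split rules above. A v4s32 load from the local (LDS) address
// space on a subtarget where useDS128() is false has MemSize = 128 >
// maxSizeForAddrSpace(LOCAL_ADDRESS) = 64, so needToSplitLoad returns true
// and fewerElementsIf breaks it into two v2s32 loads, each legal per the
// table above. The helper below is hypothetical and returns the element
// count of each piece (1 means the pieces are re-legalized as scalars).
static unsigned eltsPerLoadPiece(unsigned NumElts, unsigned MemSizeBits,
                                 unsigned MaxSizeBits) {
  const unsigned NumPieces = MemSizeBits / MaxSizeBits;   // 128 / 64 = 2
  if (NumPieces == 1 || NumPieces >= NumElts || NumElts % NumPieces != 0)
    return 1;                                             // odd breakdown: scalarize
  return NumElts / NumPieces;                             // 4 / 2 = 2 -> v2s32 pieces
}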
- // FIXME: Handle alignment requirements.
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
- .legalForTypesWithMemDesc({
- {S32, GlobalPtr, 8, 8},
- {S32, GlobalPtr, 16, 8},
- {S32, LocalPtr, 8, 8},
- {S32, LocalPtr, 16, 8},
- {S32, PrivatePtr, 8, 8},
- {S32, PrivatePtr, 16, 8}});
+ .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
+ {S32, GlobalPtr, 16, 2 * 8},
+ {S32, LocalPtr, 8, 8},
+ {S32, LocalPtr, 16, 16},
+ {S32, PrivatePtr, 8, 8},
+ {S32, PrivatePtr, 16, 16},
+ {S32, ConstantPtr, 8, 8},
+ {S32, ConstantPtr, 16, 2 * 8}});
if (ST.hasFlatAddressSpace()) {
- ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
- {S32, FlatPtr, 16, 8}});
+ ExtLoads.legalForTypesWithMemDesc(
+ {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
}
ExtLoads.clampScalar(0, S32, S32)
@@ -590,6 +855,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
+ .legalFor({{S32, LocalPtr}});
+
+ getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
+ .lower();
+
// TODO: Pointer types, any 32-bit or 64-bit vector
getActionDefinitionsBuilder(G_SELECT)
.legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
@@ -643,7 +914,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return (EltTy.getSizeInBits() == 16 ||
EltTy.getSizeInBits() % 32 == 0) &&
VecTy.getSizeInBits() % 32 == 0 &&
- VecTy.getSizeInBits() <= 512 &&
+ VecTy.getSizeInBits() <= 1024 &&
IdxTy.getSizeInBits() == 32;
})
.clampScalar(EltTypeIdx, S32, S64)
@@ -663,6 +934,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Doesn't handle extract of illegal sizes.
getActionDefinitionsBuilder(Op)
+ .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
+ // FIXME: Multiples of 16 should not be legal.
.legalIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
const LLT LitTy = Query.Types[LitTyIdx];
@@ -686,18 +959,36 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
- getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalForCartesianProduct(AllS32Vectors, {S32})
- .legalForCartesianProduct(AllS64Vectors, {S64})
- .clampNumElements(0, V16S32, V16S32)
- .clampNumElements(0, V2S64, V8S64)
- .minScalarSameAs(1, 0)
- .legalIf(isRegisterType(0))
- .minScalarOrElt(0, S32);
+ auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
+ .legalForCartesianProduct(AllS32Vectors, {S32})
+ .legalForCartesianProduct(AllS64Vectors, {S64})
+ .clampNumElements(0, V16S32, V32S32)
+ .clampNumElements(0, V2S64, V16S64)
+ .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
+
+ if (ST.hasScalarPackInsts())
+ BuildVector.legalFor({V2S16, S32});
+
+ BuildVector
+ .minScalarSameAs(1, 0)
+ .legalIf(isRegisterType(0))
+ .minScalarOrElt(0, S32);
+
+ if (ST.hasScalarPackInsts()) {
+ getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
+ .legalFor({V2S16, S32})
+ .lower();
+ } else {
+ getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
+ .lower();
+ }
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
.legalIf(isRegisterType(0));
+ // TODO: Don't fully scalarize v2s16 pieces
+ getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
+
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
@@ -715,14 +1006,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return false;
};
- getActionDefinitionsBuilder(Op)
+ auto &Builder = getActionDefinitionsBuilder(Op)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
// Clamp the little scalar to s8-s256 and make it a power of 2. It's not
// worth considering the multiples of 64 since 2*192 and 2*384 are not
// valid.
.clampScalar(LitTyIdx, S16, S256)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
-
+ .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
+ .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
+ elementTypeIs(1, S16)),
+ changeTo(1, V2S16))
// Break up vectors with weird elements into scalars
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
@@ -730,25 +1024,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
scalarize(1))
- .clampScalar(BigTyIdx, S32, S512)
- .widenScalarIf(
+ .clampScalar(BigTyIdx, S32, S1024)
+ .lowerFor({{S16, V2S16}});
+
+ if (Op == G_MERGE_VALUES) {
+ Builder.widenScalarIf(
+ // TODO: Use 16-bit shifts if legal for 8-bit values?
[=](const LegalityQuery &Query) {
- const LLT &Ty = Query.Types[BigTyIdx];
- return !isPowerOf2_32(Ty.getSizeInBits()) &&
- Ty.getSizeInBits() % 16 != 0;
+ const LLT Ty = Query.Types[LitTyIdx];
+ return Ty.getSizeInBits() < 32;
},
- [=](const LegalityQuery &Query) {
- // Pick the next power of 2, or a multiple of 64 over 128.
- // Whichever is smaller.
- const LLT &Ty = Query.Types[BigTyIdx];
- unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
- if (NewSizeInBits >= 256) {
- unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
- if (RoundedTo < NewSizeInBits)
- NewSizeInBits = RoundedTo;
- }
- return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
- })
+ changeTo(LitTyIdx, S32));
+ }
+
+ Builder.widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[BigTyIdx];
+ return !isPowerOf2_32(Ty.getSizeInBits()) &&
+ Ty.getSizeInBits() % 16 != 0;
+ },
+ [=](const LegalityQuery &Query) {
+ // Pick the next power of 2, or a multiple of 64 over 128.
+ // Whichever is smaller.
+ const LLT &Ty = Query.Types[BigTyIdx];
+ unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
+ if (NewSizeInBits >= 256) {
+ unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
+ if (RoundedTo < NewSizeInBits)
+ NewSizeInBits = RoundedTo;
+ }
+ return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
+ })
.legalIf([=](const LegalityQuery &Query) {
const LLT &BigTy = Query.Types[BigTyIdx];
const LLT &LitTy = Query.Types[LitTyIdx];
@@ -760,43 +1066,56 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return BigTy.getSizeInBits() % 16 == 0 &&
LitTy.getSizeInBits() % 16 == 0 &&
- BigTy.getSizeInBits() <= 512;
+ BigTy.getSizeInBits() <= 1024;
})
// Any vectors left are the wrong size. Scalarize them.
.scalarize(0)
.scalarize(1);
}
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+
computeTables();
verify(*ST.getInstrInfo());
}
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
+ MachineIRBuilder &B,
GISelChangeObserver &Observer) const {
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
- return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
+ return legalizeAddrSpaceCast(MI, MRI, B);
case TargetOpcode::G_FRINT:
- return legalizeFrint(MI, MRI, MIRBuilder);
+ return legalizeFrint(MI, MRI, B);
case TargetOpcode::G_FCEIL:
- return legalizeFceil(MI, MRI, MIRBuilder);
+ return legalizeFceil(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_TRUNC:
- return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
+ return legalizeIntrinsicTrunc(MI, MRI, B);
case TargetOpcode::G_SITOFP:
- return legalizeITOFP(MI, MRI, MIRBuilder, true);
+ return legalizeITOFP(MI, MRI, B, true);
case TargetOpcode::G_UITOFP:
- return legalizeITOFP(MI, MRI, MIRBuilder, false);
+ return legalizeITOFP(MI, MRI, B, false);
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINNUM_IEEE:
case TargetOpcode::G_FMAXNUM_IEEE:
- return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
+ return legalizeMinNumMaxNum(MI, MRI, B);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
- return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
+ return legalizeExtractVectorElt(MI, MRI, B);
case TargetOpcode::G_INSERT_VECTOR_ELT:
- return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
+ return legalizeInsertVectorElt(MI, MRI, B);
+ case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FCOS:
+ return legalizeSinCos(MI, MRI, B);
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return legalizeGlobalValue(MI, MRI, B);
+ case TargetOpcode::G_LOAD:
+ return legalizeLoad(MI, MRI, B, Observer);
+ case TargetOpcode::G_FMAD:
+ return legalizeFMad(MI, MRI, B);
+ case TargetOpcode::G_FDIV:
+ return legalizeFDIV(MI, MRI, B);
default:
return false;
}
@@ -807,11 +1126,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
Register AMDGPULegalizerInfo::getSegmentAperture(
unsigned AS,
MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineIRBuilder &B) const {
+ MachineFunction &MF = B.getMF();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const LLT S32 = LLT::scalar(32);
+ assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
+
if (ST.hasApertureRegs()) {
// FIXME: Use inline constants (src_{shared, private}_base) instead of
// getreg.
@@ -829,13 +1150,13 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Register ApertureReg = MRI.createGenericVirtualRegister(S32);
Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
+ B.buildInstr(AMDGPU::S_GETREG_B32)
.addDef(GetReg)
.addImm(Encoding);
MRI.setType(GetReg, S32);
- auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
- MIRBuilder.buildInstr(TargetOpcode::G_SHL)
+ auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
+ B.buildInstr(TargetOpcode::G_SHL)
.addDef(ApertureReg)
.addUse(GetReg)
.addUse(ShiftAmt.getReg(0));
@@ -846,8 +1167,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Register QueuePtr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
- // FIXME: Placeholder until we can track the input registers.
- MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
+ return Register();
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
@@ -870,18 +1192,19 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Register LoadResult = MRI.createGenericVirtualRegister(S32);
Register LoadAddr;
- MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
- MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
+ B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
+ B.buildLoad(LoadResult, LoadAddr, *MMO);
return LoadResult;
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineIRBuilder &B) const {
+ MachineFunction &MF = B.getMF();
- MIRBuilder.setInstr(MI);
+ B.setInstr(MI);
+ const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
@@ -899,7 +1222,28 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
- MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
+ MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
+ return true;
+ }
+
+ if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ // Truncate.
+ B.buildExtract(Dst, Src, 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ uint32_t AddrHiVal = Info->get32BitAddressHighBits();
+
+ // FIXME: This is a bit ugly: we build a merge of 2 pointers to produce
+ // another pointer. Merge operands are required to be the same type, but
+ // creating an extra ptrtoint would be kind of pointless.
+ auto HighAddr = B.buildConstant(
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
+ B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
+ MI.eraseFromParent();
return true;
}
@@ -908,47 +1252,52 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
DestAS == AMDGPUAS::PRIVATE_ADDRESS);
unsigned NullVal = TM.getNullPointerValue(DestAS);
- auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
- auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
+ auto SegmentNull = B.buildConstant(DstTy, NullVal);
+ auto FlatNull = B.buildConstant(SrcTy, 0);
Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
// Extract low 32-bits of the pointer.
- MIRBuilder.buildExtract(PtrLo32, Src, 0);
+ B.buildExtract(PtrLo32, Src, 0);
Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
- MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
- MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
+ B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
+ B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();
return true;
}
- assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
+ if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
+ return false;
+
+ if (!ST.hasFlatAddressSpace())
+ return false;
auto SegmentNull =
- MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+ B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
auto FlatNull =
- MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+ B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
- Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
+ Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+ if (!ApertureReg.isValid())
+ return false;
Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
- MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
+ B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
// Coerce the type of the low half of the result so we can use merge_values.
- Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
- MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
+ Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
+ B.buildInstr(TargetOpcode::G_PTRTOINT)
.addDef(SrcAsInt)
.addUse(Src);
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
- MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
- MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
+ B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
+ B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
MI.eraseFromParent();
return true;
@@ -956,8 +1305,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
bool AMDGPULegalizerInfo::legalizeFrint(
MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
- MIRBuilder.setInstr(MI);
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
Register Src = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(Src);
@@ -966,18 +1315,18 @@ bool AMDGPULegalizerInfo::legalizeFrint(
APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
- auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
- auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
+ auto C1 = B.buildFConstant(Ty, C1Val);
+ auto CopySign = B.buildFCopysign(Ty, C1, Src);
// TODO: Should this propagate fast-math-flags?
- auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
- auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
+ auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
+ auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
- auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
- auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
+ auto C2 = B.buildFConstant(Ty, C2Val);
+ auto Fabs = B.buildFAbs(Ty, Src);
- auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
- MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
+ auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
+ B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
return true;
}
@@ -1124,7 +1473,7 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
MachineIRBuilder HelperBuilder(MI);
GISelObserverWrapper DummyObserver;
LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
- HelperBuilder.setMBB(*MI.getParent());
+ HelperBuilder.setInstr(MI);
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
@@ -1187,6 +1536,194 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
return true;
}
+bool AMDGPULegalizerInfo::legalizeSinCos(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ unsigned Flags = MI.getFlags();
+
+ Register TrigVal;
+ auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
+ if (ST.hasTrigReducedRange()) {
+ auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
+ TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
+ .addUse(MulVal.getReg(0))
+ .setMIFlags(Flags).getReg(0);
+ } else
+ TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
+
+ Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
+ Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
+ B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
+ .addUse(TrigVal)
+ .setMIFlags(Flags);
+ MI.eraseFromParent();
+ return true;
+}
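// A minimal illustrative sketch, not part of the patch: the scaling done in
// legalizeSinCos above. The amdgcn sin/cos intrinsics take their operand in
// revolutions rather than radians, hence the multiply by OneOver2Pi; on
// subtargets where hasTrigReducedRange() is true, the scaled value is also
// passed through llvm.amdgcn.fract to wrap it into [0, 1) first.
#include <cmath>
static double radiansToRevolutions(double Radians) {
  return Radians * (0.5 / M_PI);   // same constant as OneOver2Pi above
}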
+
+bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
+ Register DstReg, LLT PtrTy,
+ MachineIRBuilder &B, const GlobalValue *GV,
+ unsigned Offset, unsigned GAFlags) const {
+ // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
+ // to the following code sequence:
+ //
+ // For constant address space:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol
+ // s_addc_u32 s1, s1, 0
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // a fixup or relocation is emitted to replace $symbol with a literal
+ // constant, which is a pc-relative offset from the encoding of the $symbol
+ // operand to the global variable.
+ //
+ // For global address space:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
+ // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // fixups or relocations are emitted to replace $symbol@*@lo and
+ // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
+ // which is a 64-bit pc-relative offset from the encoding of the $symbol
+ // operand to the global variable.
+ //
+ // What we want here is an offset from the value returned by s_getpc
+ // (which is the address of the s_add_u32 instruction) to the global
+ // variable, but since the encoding of $symbol starts 4 bytes after the start
+ // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
+ // small. This requires us to add 4 to the global variable offset in order to
+ // compute the correct address.
+
+ LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+
+ Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
+ B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
+
+ MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
+ .addDef(PCReg);
+
+ MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
+ if (GAFlags == SIInstrInfo::MO_NONE)
+ MIB.addImm(0);
+ else
+ MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
+
+ B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
+
+ if (PtrTy.getSizeInBits() == 32)
+ B.buildExtract(DstReg, PCReg, 0);
+ return true;
+ }
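// A minimal illustrative sketch, not part of the patch: the arithmetic
// behind the "Offset + 4" addend above. The rel32 fixup resolves to
// S + A - P, where S is the symbol address, A the addend, and P the address
// of the encoded $symbol literal, which starts 4 bytes after the s_add_u32
// whose address s_getpc_b64 returns. The literal must encode the delta
// S + Offset - getpc, so S + A - (getpc + 4) == S + Offset - getpc, giving
// A = Offset + 4. The helper name below is hypothetical.
#include <cstdint>
static int64_t pcRelFixupValue(uint64_t Sym, uint64_t GetPC, int64_t Addend) {
  const uint64_t LiteralAddr = GetPC + 4;   // $symbol sits 4 bytes into s_add_u32
  return int64_t(Sym) + Addend - int64_t(LiteralAddr);
}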
+
+bool AMDGPULegalizerInfo::legalizeGlobalValue(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ unsigned AS = Ty.getAddressSpace();
+
+ const GlobalValue *GV = MI.getOperand(1).getGlobal();
+ MachineFunction &MF = B.getMF();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ B.setInstr(MI);
+
+ if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
+ if (!MFI->isEntryFunction()) {
+ const Function &Fn = MF.getFunction();
+ DiagnosticInfoUnsupported BadLDSDecl(
+ Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
+ Fn.getContext().diagnose(BadLDSDecl);
+ }
+
+ // TODO: We could emit code to handle the initialization somewhere.
+ if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
+ B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
+ MI.eraseFromParent();
+ return true;
+ }
+
+ const Function &Fn = MF.getFunction();
+ DiagnosticInfoUnsupported BadInit(
+ Fn, "unsupported initializer for address space", MI.getDebugLoc());
+ Fn.getContext().diagnose(BadInit);
+ return true;
+ }
+
+ const SITargetLowering *TLI = ST.getTargetLowering();
+
+ if (TLI->shouldEmitFixup(GV)) {
+ buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (TLI->shouldEmitPCReloc(GV)) {
+ buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
+
+ MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getGOT(MF),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ 8 /*Size*/, 8 /*Align*/);
+
+ buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
+
+ if (Ty.getSizeInBits() == 32) {
+ // Truncate if this is a 32-bit constant address.
+ auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
+ B.buildExtract(DstReg, Load, 0);
+ } else
+ B.buildLoad(DstReg, GOTAddr, *GOTMMO);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeLoad(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, GISelChangeObserver &Observer) const {
+ B.setInstr(MI);
+ LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(Cast.getReg(0));
+ Observer.changedInstr(MI);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFMad(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ assert(Ty.isScalar());
+
+ // TODO: Always legal with future ftz flag.
+ if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
+ return true;
+ if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
+ return true;
+
+ MachineFunction &MF = B.getMF();
+
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
+ HelperBuilder.setMBB(*MI.getParent());
+ return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
+}
+
// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI) {
@@ -1212,10 +1749,9 @@ Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg) const {
- if (!Arg->isRegister())
+ if (!Arg->isRegister() || !Arg->getRegister().isValid())
return false; // TODO: Handle these
- assert(Arg->getRegister() != 0);
assert(Arg->getRegister().isPhysical());
MachineRegisterInfo &MRI = *B.getMRI();
@@ -1229,19 +1765,30 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
const unsigned Mask = Arg->getMask();
const unsigned Shift = countTrailingZeros<unsigned>(Mask);
- auto ShiftAmt = B.buildConstant(S32, Shift);
- auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
- B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
+ Register AndMaskSrc = LiveIn;
+
+ if (Shift != 0) {
+ auto ShiftAmt = B.buildConstant(S32, Shift);
+ AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
+ }
+
+ B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
} else
B.buildCopy(DstReg, LiveIn);
// Insert the argument copy if it doesn't already exist.
// FIXME: It seems EmitLiveInCopies isn't called anywhere?
if (!MRI.getVRegDef(LiveIn)) {
+ // FIXME: Should have scoped insert pt
+ MachineBasicBlock &OrigInsBB = B.getMBB();
+ auto OrigInsPt = B.getInsertPt();
+
MachineBasicBlock &EntryMBB = B.getMF().front();
EntryMBB.addLiveIn(Arg->getRegister());
B.setInsertPt(EntryMBB, EntryMBB.begin());
B.buildCopy(LiveIn, Arg->getRegister());
+
+ B.setInsertPt(OrigInsBB, OrigInsPt);
}
return true;
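// A minimal illustrative sketch, not part of the patch: the unpacking the
// Mask/Shift path above now emits. For an argument packed at bit 10 of an
// SGPR (hypothetical layout), Mask = 0x000ffc00 gives Shift = 10 and the
// value is (LiveIn >> 10) & 0x3ff; when Shift == 0 the G_LSHR is skipped
// and only the G_AND is built.
#include <cstdint>
static uint32_t unpackPackedArg(uint32_t LiveIn, uint32_t Mask) {
  const unsigned Shift = __builtin_ctz(Mask);   // countTrailingZeros, Mask != 0
  return (LiveIn >> Shift) & (Mask >> Shift);
}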
@@ -1272,6 +1819,113 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
return false;
}
+bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+
+ if (legalizeFastUnsafeFDIV(MI, MRI, B))
+ return true;
+
+ return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register Res = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+
+ uint16_t Flags = MI.getFlags();
+
+ LLT ResTy = MRI.getType(Res);
+ LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
+
+ const MachineFunction &MF = B.getMF();
+ bool Unsafe =
+ MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
+
+ if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
+ return false;
+
+ if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
+ return false;
+
+ if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
+ // 1 / x -> RCP(x)
+ if (CLHS->isExactlyValue(1.0)) {
+ B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
+ .addUse(RHS)
+ .setMIFlags(Flags);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // -1 / x -> RCP( FNEG(x) )
+ if (CLHS->isExactlyValue(-1.0)) {
+ auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
+ B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
+ .addUse(FNeg.getReg(0))
+ .setMIFlags(Flags);
+
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+
+ // x / y -> x * (1.0 / y)
+ if (Unsafe) {
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+ .addUse(RHS)
+ .setMIFlags(Flags);
+ B.buildFMul(Res, LHS, RCP, Flags);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+ Register Res = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(2).getReg();
+ Register RHS = MI.getOperand(3).getReg();
+ uint16_t Flags = MI.getFlags();
+
+ LLT S32 = LLT::scalar(32);
+ LLT S1 = LLT::scalar(1);
+
+ auto Abs = B.buildFAbs(S32, RHS, Flags);
+ const APFloat C0Val(1.0f);
+
+ auto C0 = B.buildConstant(S32, 0x6f800000);
+ auto C1 = B.buildConstant(S32, 0x2f800000);
+ auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
+
+ auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
+ auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
+
+ auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
+
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
+ .addUse(Mul0.getReg(0))
+ .setMIFlags(Flags);
+
+ auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
+
+ B.buildFMul(Res, Sel, Mul1, Flags);
+
+ MI.eraseFromParent();
+ return true;
+}
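// A minimal illustrative sketch, not part of the patch: what the magic
// constants above encode. 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as
// IEEE-754 singles. If |y| exceeds 2^96, the divisor is pre-scaled by 2^-32
// so the reciprocal stays out of the flushed-denormal range, and the
// quotient is multiplied by the same scale afterwards:
//   x / y == s * (x * rcp(y * s)),  where s = (|y| > 2^96) ? 2^-32 : 1.0
#include <cstdint>
#include <cstring>
static float bitsToFloat(uint32_t Bits) {   // bitsToFloat(0x6f800000) == 0x1p96f
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}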
+
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -1306,11 +1960,79 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ unsigned AddrSpace) const {
+ B.setInstr(MI);
+ Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
+ auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
+ B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ MI.eraseFromParent();
+ return true;
+}
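// A minimal illustrative sketch, not part of the patch: the comparison
// legalizeIsAddrSpace builds above. A flat pointer addresses LDS
// (amdgcn.is.shared) or scratch (amdgcn.is.private) exactly when its upper
// 32 bits equal the corresponding aperture base, so the intrinsic becomes
// an extract of bits [32, 64) and a 32-bit equality compare.
#include <cstdint>
static bool pointsIntoAperture(uint64_t FlatPtr, uint32_t ApertureBase) {
  return uint32_t(FlatPtr >> 32) == ApertureBase;
}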
+
+/// Handle register layout difference for f16 images for some subtargets.
+Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
+ MachineRegisterInfo &MRI,
+ Register Reg) const {
+ if (!ST.hasUnpackedD16VMem())
+ return Reg;
+
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+ LLT StoreVT = MRI.getType(Reg);
+ assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
+
+ auto Unmerge = B.buildUnmerge(S16, Reg);
+
+ SmallVector<Register, 4> WideRegs;
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
+
+ int NumElts = StoreVT.getNumElements();
+
+ return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+}
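// A minimal illustrative sketch, not part of the patch: the layout change
// handleD16VData performs above. With unpacked D16 VMEM, each 16-bit
// element of the store payload occupies its own 32-bit register, so the
// <N x s16> value is unmerged and each half any-extended, e.g. v4s16
// becomes v4s32 with only the low 16 bits of each lane meaningful.
#include <cstdint>
#include <vector>
static std::vector<uint32_t> unpackD16(const std::vector<uint16_t> &Halves) {
  std::vector<uint32_t> Wide;
  for (uint16_t H : Halves)
    Wide.push_back(H);   // stands in for the G_ANYEXT of each unmerged half
  return Wide;
}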
+
+bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ bool IsFormat) const {
+ // TODO: Reject f16 format on targets where unsupported.
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(VData);
+
+ B.setInstr(MI);
+
+ const LLT S32 = LLT::scalar(32);
+ const LLT S16 = LLT::scalar(16);
+
+ // Fixup illegal register types for i8 stores.
+ if (Ty == LLT::scalar(8) || Ty == S16) {
+ Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
+ MI.getOperand(1).setReg(AnyExt);
+ return true;
+ }
+
+ if (Ty.isVector()) {
+ if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
+ if (IsFormat)
+ MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
+ return true;
+ }
+
+ return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
+ }
+
+ return Ty == S32;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
// Replace the use G_BRCOND with the exec manipulate and branch pseudos.
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_if: {
if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
const SIRegisterInfo *TRI
@@ -1386,6 +2108,22 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
case Intrinsic::amdgcn_dispatch_id:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::DISPATCH_ID);
+ case Intrinsic::amdgcn_fdiv_fast:
+ return legalizeFDIVFastIntrin(MI, MRI, B);
+ case Intrinsic::amdgcn_is_shared:
+ return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
+ case Intrinsic::amdgcn_is_private:
+ return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
+ case Intrinsic::amdgcn_wavefrontsize: {
+ B.setInstr(MI);
+ B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
+ MI.eraseFromParent();
+ return true;
+ }
+ case Intrinsic::amdgcn_raw_buffer_store:
+ return legalizeRawBufferStore(MI, MRI, B, false);
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ return legalizeRawBufferStore(MI, MRI, B, true);
default:
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 3f1cc1d265dd..d0fba23a8686 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -16,6 +16,7 @@
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "SIInstrInfo.h"
namespace llvm {
@@ -32,29 +33,44 @@ public:
const GCNTargetMachine &TM);
bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
+ MachineIRBuilder &B,
GISelChangeObserver &Observer) const override;
Register getSegmentAperture(unsigned AddrSpace,
MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder, bool Signed) const;
+ MachineIRBuilder &B, bool Signed) const;
bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
+ bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
+ bool buildPCRelGlobalAddress(
+ Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
+ unsigned Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const;
+
+ bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ GISelChangeObserver &Observer) const;
+
+ bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
Register getLiveInRegister(MachineRegisterInfo &MRI,
Register Reg, LLT Ty) const;
@@ -65,10 +81,24 @@ public:
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+ bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, unsigned AddrSpace) const;
+
+ Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ Register Reg) const;
+ bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool IsFormat) const;
bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const override;
+ MachineIRBuilder &B) const override;
};
} // End llvm namespace.
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index ce0a9db7c7f4..2c94e0046651 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -30,6 +30,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -48,18 +49,10 @@ static cl::list<std::string> UseNative("amdgpu-use-native",
cl::CommaSeparated, cl::ValueOptional,
cl::Hidden);
-#define MATH_PI 3.14159265358979323846264338327950288419716939937511
-#define MATH_E 2.71828182845904523536028747135266249775724709369996
-#define MATH_SQRT2 1.41421356237309504880168872420969807856967187537695
-
-#define MATH_LOG2E 1.4426950408889634073599246810018921374266459541529859
-#define MATH_LOG10E 0.4342944819032518276511289189166050822943970058036665
-// Value of log2(10)
-#define MATH_LOG2_10 3.3219280948873623478703194294893901758648313930245806
-// Value of 1 / log2(10)
-#define MATH_RLOG2_10 0.3010299956639811952137388947244930267681898814621085
-// Value of 1 / M_LOG2E_F = 1 / log2(e)
-#define MATH_RLOG2_E 0.6931471805599453094172321214581765680755001343602552
+#define MATH_PI numbers::pi
+#define MATH_E numbers::e
+#define MATH_SQRT2 numbers::sqrt2
+#define MATH_SQRT1_2 numbers::inv_sqrt2
namespace llvm {
@@ -254,8 +247,8 @@ struct TableEntry {
/* a list of {result, input} */
static const TableEntry tbl_acos[] = {
- {MATH_PI/2.0, 0.0},
- {MATH_PI/2.0, -0.0},
+ {MATH_PI / 2.0, 0.0},
+ {MATH_PI / 2.0, -0.0},
{0.0, 1.0},
{MATH_PI, -1.0}
};
@@ -271,8 +264,8 @@ static const TableEntry tbl_acospi[] = {
static const TableEntry tbl_asin[] = {
{0.0, 0.0},
{-0.0, -0.0},
- {MATH_PI/2.0, 1.0},
- {-MATH_PI/2.0, -1.0}
+ {MATH_PI / 2.0, 1.0},
+ {-MATH_PI / 2.0, -1.0}
};
static const TableEntry tbl_asinh[] = {
{0.0, 0.0},
@@ -287,8 +280,8 @@ static const TableEntry tbl_asinpi[] = {
static const TableEntry tbl_atan[] = {
{0.0, 0.0},
{-0.0, -0.0},
- {MATH_PI/4.0, 1.0},
- {-MATH_PI/4.0, -1.0}
+ {MATH_PI / 4.0, 1.0},
+ {-MATH_PI / 4.0, -1.0}
};
static const TableEntry tbl_atanh[] = {
{0.0, 0.0},
@@ -359,7 +352,7 @@ static const TableEntry tbl_log10[] = {
};
static const TableEntry tbl_rsqrt[] = {
{1.0, 1.0},
- {1.0/MATH_SQRT2, 2.0}
+ {MATH_SQRT1_2, 2.0}
};
static const TableEntry tbl_sin[] = {
{0.0, 0.0},
@@ -868,7 +861,7 @@ static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
return ::log2(V);
#else
- return log(V) / 0.693147180559945309417;
+ return log(V) / numbers::ln2;
#endif
}
}
@@ -1430,8 +1423,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
B.SetInsertPoint(&*ItNew);
AllocaInst *Alloc = B.CreateAlloca(RetType, 0,
std::string(prefix) + UI->getName());
- Alloc->setAlignment(UCallee->getParent()->getDataLayout()
- .getTypeAllocSize(RetType));
+ Alloc->setAlignment(MaybeAlign(
+ UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
return Alloc;
}
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index a5bac25701a0..e1ae496d9cbc 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -55,7 +55,7 @@ enum EManglingParam {
};
struct ManglingRule {
- StringRef const Name;
+ const char *Name;
unsigned char Lead[2];
unsigned char Param[5];
@@ -69,7 +69,7 @@ struct ManglingRule {
// Information about library functions with unmangled names.
class UnmangledFuncInfo {
- StringRef const Name;
+ const char *Name;
unsigned NumArgs;
// Table for all lib functions with unmangled names.
@@ -82,7 +82,7 @@ class UnmangledFuncInfo {
public:
using ID = AMDGPULibFunc::EFuncId;
- UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs)
+ constexpr UnmangledFuncInfo(const char *_Name, unsigned _NumArgs)
: Name(_Name), NumArgs(_NumArgs) {}
// Get index to Table by function name.
static bool lookup(StringRef Name, ID &Id);
@@ -133,8 +133,8 @@ unsigned ManglingRule::getNumArgs() const {
// E_ANY - use prev lead type, E_CONSTPTR_ANY - make const pointer out of
// prev lead type, etc. see ParamIterator::getNextParam() for details.
-static const ManglingRule manglingRules[] = {
-{ StringRef(), {0}, {0} },
+static constexpr ManglingRule manglingRules[] = {
+{ "", {0}, {0} },
{ "abs" , {1}, {E_ANY}},
{ "abs_diff" , {1}, {E_ANY,E_COPY}},
{ "acos" , {1}, {E_ANY}},
@@ -682,9 +682,9 @@ bool AMDGPULibFunc::parse(StringRef FuncName, AMDGPULibFunc &F) {
}
if (eatTerm(FuncName, "_Z"))
- F.Impl = make_unique<AMDGPUMangledLibFunc>();
+ F.Impl = std::make_unique<AMDGPUMangledLibFunc>();
else
- F.Impl = make_unique<AMDGPUUnmangledLibFunc>();
+ F.Impl = std::make_unique<AMDGPUUnmangledLibFunc>();
if (F.Impl->parseFuncName(FuncName))
return true;
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 5dd5b3691e0a..e64542a395f0 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -72,10 +72,10 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
BasicBlock &EntryBlock = *F.begin();
IRBuilder<> Builder(&*EntryBlock.begin());
- const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
+ const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
- unsigned MaxAlign;
+ Align MaxAlign;
// FIXME: Alignment is broken with explicit arg offset.
const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
if (TotalKernArgSize == 0)
@@ -94,12 +94,12 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
for (Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
- unsigned Align = DL.getABITypeAlignment(ArgTy);
+ unsigned ABITypeAlign = DL.getABITypeAlignment(ArgTy);
unsigned Size = DL.getTypeSizeInBits(ArgTy);
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
- uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
- ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+ uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
if (Arg.use_empty())
continue;
@@ -128,8 +128,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
int64_t AlignDownOffset = alignDown(EltOffset, 4);
int64_t OffsetDiff = EltOffset - AlignDownOffset;
- unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
- KernArgBaseAlign);
+ Align AdjustedAlign = commonAlignment(
+ KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
Value *ArgPtr;
Type *AdjustedArgTy;
@@ -160,7 +160,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
ArgPtr->getName() + ".cast");
LoadInst *Load =
- Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
+ Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign.value());
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
MDBuilder MDB(Ctx);
@@ -220,8 +220,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
}
KernArgSegment->addAttribute(
- AttributeList::ReturnIndex,
- Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
+ AttributeList::ReturnIndex,
+ Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index ae4c32c258a7..3760aed87a43 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -211,6 +211,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
lowerOperand(MO, MCOp);
OutMI.addOperand(MCOp);
}
+
+ int FIIdx = AMDGPU::getNamedOperandIdx(MCOpcode, AMDGPU::OpName::fi);
+ if (FIIdx >= (int)OutMI.getNumOperands())
+ OutMI.addOperand(MCOperand::createImm(0));
}
bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 237490957058..ba72f71f4322 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -694,7 +694,7 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- if (TRI->isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
<< "\n");
// If this is a source register to a PHI we are chaining, it
@@ -734,7 +734,7 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- if (TRI->isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
<< "\n");
for (auto &UI : MRI->use_operands(Reg)) {
@@ -949,7 +949,7 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
(IncludeLoopPHI && IsLoopPHI);
if (ShouldReplace) {
- if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ if (Register::isPhysicalRegister(NewRegister)) {
LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
<< printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
@@ -1016,13 +1016,15 @@ bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) {
// before are no longer register kills.
void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ (void)TRI; // It's used by LLVM_DEBUG.
+
for (auto MBBI : MBBs) {
MachineBasicBlock *MBB = MBBI;
for (auto &II : *MBB) {
for (auto &RI : II.uses()) {
if (RI.isReg()) {
- unsigned Reg = RI.getReg();
- if (TRI->isVirtualRegister(Reg)) {
+ Register Reg = RI.getReg();
+ if (Register::isVirtualRegister(Reg)) {
if (hasNoDef(Reg, MRI))
continue;
if (!MRI->hasOneDef(Reg)) {
@@ -1402,7 +1404,7 @@ void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest(
unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo(
MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) {
unsigned DestReg = getPHIDestReg(PHI);
- unsigned LinearizeDestReg =
+ Register LinearizeDestReg =
MRI->createVirtualRegister(MRI->getRegClass(DestReg));
PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc());
storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices);
@@ -1890,7 +1892,7 @@ void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled(
if (!Cond[0].isReg())
return;
- unsigned CondReg = Cond[0].getReg();
+ Register CondReg = Cond[0].getReg();
for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) {
(*UI).setIsKill(false);
}
@@ -1929,8 +1931,8 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co
BBSelectReg, TrueBB->getNumber());
} else {
const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg);
- unsigned TrueBBReg = MRI->createVirtualRegister(RegClass);
- unsigned FalseBBReg = MRI->createVirtualRegister(RegClass);
+ Register TrueBBReg = MRI->createVirtualRegister(RegClass);
+ Register FalseBBReg = MRI->createVirtualRegister(RegClass);
TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
TrueBBReg, TrueBB->getNumber());
TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
@@ -1996,7 +1998,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB,
InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI);
}
const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
- unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
+ Register NextDestReg = MRI->createVirtualRegister(RegClass);
bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
LLVM_DEBUG(dbgs() << "Insert Chained PHI\n");
insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
@@ -2056,8 +2058,8 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
// register, unless it is the outgoing BB select register. We have
// already created phi nodes for these.
const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
- unsigned PHIDestReg = MRI->createVirtualRegister(RegClass);
- unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
+ Register PHIDestReg = MRI->createVirtualRegister(RegClass);
+ Register IfSourceReg = MRI->createVirtualRegister(RegClass);
// Create initializer, this value is never used, but is needed
// to satisfy SSA.
LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n");
@@ -2172,7 +2174,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent();
const TargetRegisterClass *RegClass =
MRI->getRegClass(CurrentBackedgeReg);
- unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass);
+ Register NewBackedgeReg = MRI->createVirtualRegister(RegClass);
MachineInstrBuilder BackedgePHI =
BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL,
TII->get(TargetOpcode::PHI), NewBackedgeReg);
@@ -2230,7 +2232,7 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
I != E;) {
MachineOperand &O = *I;
++I;
- if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ if (Register::isPhysicalRegister(NewRegister)) {
LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
<< printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
@@ -2309,7 +2311,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
} else {
// Handle internal block.
const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn);
- unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass);
+ Register CodeBBSelectReg = MRI->createVirtualRegister(RegClass);
rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg);
bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB;
MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB,
@@ -2446,7 +2448,7 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
}
const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest);
- unsigned NewDestReg = MRI->createVirtualRegister(RegClass);
+ Register NewDestReg = MRI->createVirtualRegister(RegClass);
LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI);
MachineInstrBuilder MIB =
BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
@@ -2734,9 +2736,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
}
const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI());
unsigned InReg = LRegion->getBBSelectRegIn();
- unsigned InnerSelectReg =
+ Register InnerSelectReg =
MRI->createVirtualRegister(MRI->getRegClass(InReg));
- unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ Register NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg));
TII->materializeImmediate(*(LRegion->getEntry()),
LRegion->getEntry()->getFirstTerminator(), DL,
NewInReg, Region->getEntry()->getNumber());
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 0d3a1f1a769f..89ca702f577d 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -17,7 +17,6 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
LocalMemoryObjects(),
ExplicitKernArgSize(0),
- MaxKernArgAlign(0),
LDSSize(0),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 52987e2fa411..9818ab1ef148 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -23,7 +23,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
protected:
uint64_t ExplicitKernArgSize; // Cache for this.
- unsigned MaxKernArgAlign; // Cache for this.
+ Align MaxKernArgAlign; // Cache for this.
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
@@ -47,9 +47,7 @@ public:
return ExplicitKernArgSize;
}
- unsigned getMaxKernArgAlign() const {
- return MaxKernArgAlign;
- }
+ unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); }
unsigned getLDSSize() const {
return LDSSize;
diff --git a/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
new file mode 100644
index 000000000000..5250bf455d71
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -0,0 +1,592 @@
+//=== AMDGPUPrintfRuntimeBinding.cpp - OpenCL printf implementation -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// \file
+//
+// This pass binds printf calls to a kernel argument pointer that will be
+// bound to a buffer later by the runtime.
+//
+// The pass traverses the functions in the module and converts
+// each call to printf to a sequence of operations that
+// store the following into the printf buffer:
+// - the format string (passed as a module metadata unique ID)
+// - bitwise copies of the printf arguments
+// The backend passes will then need to store the metadata in the kernel.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "printfToRuntime"
+#define DWORD_ALIGN 4
+
+namespace {
+class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding final
+ : public ModulePass {
+
+public:
+ static char ID;
+
+ explicit AMDGPUPrintfRuntimeBinding();
+
+private:
+ bool runOnModule(Module &M) override;
+ void getConversionSpecifiers(SmallVectorImpl<char> &OpConvSpecifiers,
+ StringRef fmt, size_t num_ops) const;
+
+ bool shouldPrintAsStr(char Specifier, Type *OpType) const;
+ bool
+ lowerPrintfForGpu(Module &M,
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+
+ Value *simplify(Instruction *I, const TargetLibraryInfo *TLI) {
+ return SimplifyInstruction(I, {*TD, TLI, DT});
+ }
+
+ const DataLayout *TD;
+ const DominatorTree *DT;
+ SmallVector<CallInst *, 32> Printfs;
+};
+} // namespace
+
+char AMDGPUPrintfRuntimeBinding::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUPrintfRuntimeBinding,
+ "amdgpu-printf-runtime-binding", "AMDGPU Printf lowering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPrintfRuntimeBinding, "amdgpu-printf-runtime-binding",
+ "AMDGPU Printf lowering", false, false)
+
+char &llvm::AMDGPUPrintfRuntimeBindingID = AMDGPUPrintfRuntimeBinding::ID;
+
+namespace llvm {
+ModulePass *createAMDGPUPrintfRuntimeBinding() {
+ return new AMDGPUPrintfRuntimeBinding();
+}
+} // namespace llvm
+
+AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding()
+ : ModulePass(ID), TD(nullptr), DT(nullptr) {
+ initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry());
+}
+
+void AMDGPUPrintfRuntimeBinding::getConversionSpecifiers(
+ SmallVectorImpl<char> &OpConvSpecifiers, StringRef Fmt,
+ size_t NumOps) const {
+  // Not all format characters are collected.
+  // At this time the format characters of interest are %p and %s, which are
+  // used to decide whether we are storing a literal string or a pointer to
+  // the printf buffer.
+ static const char ConvSpecifiers[] = "cdieEfgGaosuxXp";
+ size_t CurFmtSpecifierIdx = 0;
+ size_t PrevFmtSpecifierIdx = 0;
+
+ while ((CurFmtSpecifierIdx = Fmt.find_first_of(
+ ConvSpecifiers, CurFmtSpecifierIdx)) != StringRef::npos) {
+ bool ArgDump = false;
+ StringRef CurFmt = Fmt.substr(PrevFmtSpecifierIdx,
+ CurFmtSpecifierIdx - PrevFmtSpecifierIdx);
+ size_t pTag = CurFmt.find_last_of("%");
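+      // If the segment before this specifier contains a '%', count the
+      // consecutive '%' characters ending there: an odd count marks a real
+      // conversion argument, an even count means the '%' is escaped ("%%").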
+ if (pTag != StringRef::npos) {
+ ArgDump = true;
+ while (pTag && CurFmt[--pTag] == '%') {
+ ArgDump = !ArgDump;
+ }
+ }
+
+ if (ArgDump)
+ OpConvSpecifiers.push_back(Fmt[CurFmtSpecifierIdx]);
+
+ PrevFmtSpecifierIdx = ++CurFmtSpecifierIdx;
+ }
+}
+
+bool AMDGPUPrintfRuntimeBinding::shouldPrintAsStr(char Specifier,
+ Type *OpType) const {
+ if (Specifier != 's')
+ return false;
+ const PointerType *PT = dyn_cast<PointerType>(OpType);
+ if (!PT || PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ return false;
+ Type *ElemType = PT->getContainedType(0);
+ if (ElemType->getTypeID() != Type::IntegerTyID)
+ return false;
+ IntegerType *ElemIType = cast<IntegerType>(ElemType);
+ return ElemIType->getBitWidth() == 8;
+}
+
+bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
+ Module &M, function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> Builder(Ctx);
+ Type *I32Ty = Type::getInt32Ty(Ctx);
+ unsigned UniqID = 0;
+  // NB: It is important that the size of this string be divisible by 4.
+ const char NonLiteralStr[4] = "???";
+
+ for (auto CI : Printfs) {
+ unsigned NumOps = CI->getNumArgOperands();
+
+ SmallString<16> OpConvSpecifiers;
+ Value *Op = CI->getArgOperand(0);
+
+ if (auto LI = dyn_cast<LoadInst>(Op)) {
+ Op = LI->getPointerOperand();
+ for (auto Use : Op->users()) {
+ if (auto SI = dyn_cast<StoreInst>(Use)) {
+ Op = SI->getValueOperand();
+ break;
+ }
+ }
+ }
+
+ if (auto I = dyn_cast<Instruction>(Op)) {
+ Value *Op_simplified = simplify(I, &GetTLI(*I->getFunction()));
+ if (Op_simplified)
+ Op = Op_simplified;
+ }
+
+ ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Op);
+
+ if (ConstExpr) {
+ GlobalVariable *GVar = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+
+ StringRef Str("unknown");
+ if (GVar && GVar->hasInitializer()) {
+ auto Init = GVar->getInitializer();
+ if (auto CA = dyn_cast<ConstantDataArray>(Init)) {
+ if (CA->isString())
+ Str = CA->getAsCString();
+ } else if (isa<ConstantAggregateZero>(Init)) {
+ Str = "";
+ }
+        //
+        // We need this call to ascertain whether we are printing a string or
+        // a pointer. It extracts the conversion specifiers and fills in the
+        // first argument (OpConvSpecifiers).
+ getConversionSpecifiers(OpConvSpecifiers, Str, NumOps - 1);
+ }
+ // Add metadata for the string
+ std::string AStreamHolder;
+ raw_string_ostream Sizes(AStreamHolder);
+ int Sum = DWORD_ALIGN;
+ Sizes << CI->getNumArgOperands() - 1;
+ Sizes << ':';
+ for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() &&
+ ArgCount <= OpConvSpecifiers.size();
+ ArgCount++) {
+ Value *Arg = CI->getArgOperand(ArgCount);
+ Type *ArgType = Arg->getType();
+ unsigned ArgSize = TD->getTypeAllocSizeInBits(ArgType);
+ ArgSize = ArgSize / 8;
+ //
+ // ArgSize by design should be a multiple of DWORD_ALIGN,
+ // expand the arguments that do not follow this rule.
+ //
+ if (ArgSize % DWORD_ALIGN != 0) {
+ llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx);
+ VectorType *LLVMVecType = llvm::dyn_cast<llvm::VectorType>(ArgType);
+ int NumElem = LLVMVecType ? LLVMVecType->getNumElements() : 1;
+ if (LLVMVecType && NumElem > 1)
+ ResType = llvm::VectorType::get(ResType, NumElem);
+ Builder.SetInsertPoint(CI);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+ if (OpConvSpecifiers[ArgCount - 1] == 'x' ||
+ OpConvSpecifiers[ArgCount - 1] == 'X' ||
+ OpConvSpecifiers[ArgCount - 1] == 'u' ||
+ OpConvSpecifiers[ArgCount - 1] == 'o')
+ Arg = Builder.CreateZExt(Arg, ResType);
+ else
+ Arg = Builder.CreateSExt(Arg, ResType);
+ ArgType = Arg->getType();
+ ArgSize = TD->getTypeAllocSizeInBits(ArgType);
+ ArgSize = ArgSize / 8;
+ CI->setOperand(ArgCount, Arg);
+ }
+ if (OpConvSpecifiers[ArgCount - 1] == 'f') {
+ ConstantFP *FpCons = dyn_cast<ConstantFP>(Arg);
+ if (FpCons)
+ ArgSize = 4;
+ else {
+ FPExtInst *FpExt = dyn_cast<FPExtInst>(Arg);
+ if (FpExt && FpExt->getType()->isDoubleTy() &&
+ FpExt->getOperand(0)->getType()->isFloatTy())
+ ArgSize = 4;
+ }
+ }
+ if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) {
+ if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
+ GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+ if (GV && GV->hasInitializer()) {
+ Constant *Init = GV->getInitializer();
+ ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init);
+ if (Init->isZeroValue() || CA->isString()) {
+ size_t SizeStr = Init->isZeroValue()
+ ? 1
+ : (strlen(CA->getAsCString().data()) + 1);
+ size_t Rem = SizeStr % DWORD_ALIGN;
+ size_t NSizeStr = 0;
+ LLVM_DEBUG(dbgs() << "Printf string original size = " << SizeStr
+ << '\n');
+ if (Rem) {
+ NSizeStr = SizeStr + (DWORD_ALIGN - Rem);
+ } else {
+ NSizeStr = SizeStr;
+ }
+ ArgSize = NSizeStr;
+ }
+ } else {
+ ArgSize = sizeof(NonLiteralStr);
+ }
+ } else {
+ ArgSize = sizeof(NonLiteralStr);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Printf ArgSize (in buffer) = " << ArgSize
+ << " for type: " << *ArgType << '\n');
+ Sizes << ArgSize << ':';
+ Sum += ArgSize;
+ }
+ LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str.str()
+ << '\n');
+ for (size_t I = 0; I < Str.size(); ++I) {
+        // The rest of the C escape sequences (e.g. \') are handled correctly
+        // by the MDParser.
+ switch (Str[I]) {
+ case '\a':
+ Sizes << "\\a";
+ break;
+ case '\b':
+ Sizes << "\\b";
+ break;
+ case '\f':
+ Sizes << "\\f";
+ break;
+ case '\n':
+ Sizes << "\\n";
+ break;
+ case '\r':
+ Sizes << "\\r";
+ break;
+ case '\v':
+ Sizes << "\\v";
+ break;
+ case ':':
+        case ':':
+          // ':' cannot be scanned by Flex, as it is defined as a delimiter.
+          // Replace it with its octal representation \72.
+ Sizes << "\\72";
+ break;
+ default:
+ Sizes << Str[I];
+ break;
+ }
+ }
+
+ // Insert the printf_alloc call
+ Builder.SetInsertPoint(CI);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ AttributeList Attr = AttributeList::get(Ctx, AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+
+ Type *SizetTy = Type::getInt32Ty(Ctx);
+
+ Type *Tys_alloc[1] = {SizetTy};
+ Type *I8Ptr = PointerType::get(Type::getInt8Ty(Ctx), 1);
+ FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false);
+ FunctionCallee PrintfAllocFn =
+ M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr);
+
+ LLVM_DEBUG(dbgs() << "Printf metadata = " << Sizes.str() << '\n');
+ std::string fmtstr = itostr(++UniqID) + ":" + Sizes.str().c_str();
+ MDString *fmtStrArray = MDString::get(Ctx, fmtstr);
+
+      // Instead of creating global variables, the printf format strings are
+      // extracted and passed as metadata. This avoids polluting llvm's symbol
+      // tables in this module. Metadata is going to be extracted by the
+      // backend passes and inserted into the OpenCL binary as appropriate.
+ StringRef amd("llvm.printf.fmts");
+ NamedMDNode *metaD = M.getOrInsertNamedMetadata(amd);
+ MDNode *myMD = MDNode::get(Ctx, fmtStrArray);
+ metaD->addOperand(myMD);
+ Value *sumC = ConstantInt::get(SizetTy, Sum, false);
+ SmallVector<Value *, 1> alloc_args;
+ alloc_args.push_back(sumC);
+ CallInst *pcall =
+ CallInst::Create(PrintfAllocFn, alloc_args, "printf_alloc_fn", CI);
+
+      //
+      // Insert code to split the basic block with a piece of hammock code.
+      // The basic block splits after the buffer-overflow check.
+      //
+ ConstantPointerNull *zeroIntPtr =
+ ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx), 1));
+ ICmpInst *cmp =
+ dyn_cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, ""));
+ if (!CI->use_empty()) {
+ Value *result =
+ Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res");
+ CI->replaceAllUsesWith(result);
+ }
+ SplitBlock(CI->getParent(), cmp);
+ Instruction *Brnch =
+ SplitBlockAndInsertIfThen(cmp, cmp->getNextNode(), false);
+
+ Builder.SetInsertPoint(Brnch);
+
+ // store unique printf id in the buffer
+ //
+ SmallVector<Value *, 1> ZeroIdxList;
+ ConstantInt *zeroInt =
+ ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10));
+ ZeroIdxList.push_back(zeroInt);
+
+ GetElementPtrInst *BufferIdx =
+ dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create(
+ nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch));
+
+ Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS);
+ Value *id_gep_cast =
+ new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch);
+
+ StoreInst *stbuff =
+ new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast);
+      stbuff->insertBefore(Brnch); // to remove unused variable warning
+
+ SmallVector<Value *, 2> FourthIdxList;
+ ConstantInt *fourInt =
+ ConstantInt::get(Ctx, APInt(32, StringRef("4"), 10));
+
+ FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id
+ // the following GEP is the buffer pointer
+ BufferIdx = cast<GetElementPtrInst>(GetElementPtrInst::Create(
+ nullptr, pcall, FourthIdxList, "PrintBuffGep", Brnch));
+
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ Type *Int64Ty = Type::getInt64Ty(Ctx);
+ for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() &&
+ ArgCount <= OpConvSpecifiers.size();
+ ArgCount++) {
+ Value *Arg = CI->getArgOperand(ArgCount);
+ Type *ArgType = Arg->getType();
+ SmallVector<Value *, 32> WhatToStore;
+ if (ArgType->isFPOrFPVectorTy() &&
+ (ArgType->getTypeID() != Type::VectorTyID)) {
+ Type *IType = (ArgType->isFloatTy()) ? Int32Ty : Int64Ty;
+ if (OpConvSpecifiers[ArgCount - 1] == 'f') {
+ ConstantFP *fpCons = dyn_cast<ConstantFP>(Arg);
+ if (fpCons) {
+ APFloat Val(fpCons->getValueAPF());
+ bool Lost = false;
+ Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+ &Lost);
+ Arg = ConstantFP::get(Ctx, Val);
+ IType = Int32Ty;
+ } else {
+ FPExtInst *FpExt = dyn_cast<FPExtInst>(Arg);
+ if (FpExt && FpExt->getType()->isDoubleTy() &&
+ FpExt->getOperand(0)->getType()->isFloatTy()) {
+ Arg = FpExt->getOperand(0);
+ IType = Int32Ty;
+ }
+ }
+ }
+ Arg = new BitCastInst(Arg, IType, "PrintArgFP", Brnch);
+ WhatToStore.push_back(Arg);
+ } else if (ArgType->getTypeID() == Type::PointerTyID) {
+ if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) {
+ const char *S = NonLiteralStr;
+ if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
+ GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+ if (GV && GV->hasInitializer()) {
+ Constant *Init = GV->getInitializer();
+ ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init);
+ if (Init->isZeroValue() || CA->isString()) {
+ S = Init->isZeroValue() ? "" : CA->getAsCString().data();
+ }
+ }
+ }
+ size_t SizeStr = strlen(S) + 1;
+ size_t Rem = SizeStr % DWORD_ALIGN;
+ size_t NSizeStr = 0;
+ if (Rem) {
+ NSizeStr = SizeStr + (DWORD_ALIGN - Rem);
+ } else {
+ NSizeStr = SizeStr;
+ }
+ if (S[0]) {
+ char *MyNewStr = new char[NSizeStr]();
+ strcpy(MyNewStr, S);
+ int NumInts = NSizeStr / 4;
+ int CharC = 0;
+ while (NumInts) {
+ int ANum = *(int *)(MyNewStr + CharC);
+ CharC += 4;
+ NumInts--;
+ Value *ANumV = ConstantInt::get(Int32Ty, ANum, false);
+ WhatToStore.push_back(ANumV);
+ }
+ delete[] MyNewStr;
+ } else {
+            // Empty string; give a hint to the RT that it is not NULL.
+ Value *ANumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false);
+ WhatToStore.push_back(ANumV);
+ }
+ } else {
+ uint64_t Size = TD->getTypeAllocSizeInBits(ArgType);
+ assert((Size == 32 || Size == 64) && "unsupported size");
+ Type *DstType = (Size == 32) ? Int32Ty : Int64Ty;
+ Arg = new PtrToIntInst(Arg, DstType, "PrintArgPtr", Brnch);
+ WhatToStore.push_back(Arg);
+ }
+ } else if (ArgType->getTypeID() == Type::VectorTyID) {
+ Type *IType = NULL;
+ uint32_t EleCount = cast<VectorType>(ArgType)->getNumElements();
+ uint32_t EleSize = ArgType->getScalarSizeInBits();
+ uint32_t TotalSize = EleCount * EleSize;
+ if (EleCount == 3) {
+ IntegerType *Int32Ty = Type::getInt32Ty(ArgType->getContext());
+ Constant *Indices[4] = {
+ ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 1),
+ ConstantInt::get(Int32Ty, 2), ConstantInt::get(Int32Ty, 2)};
+ Constant *Mask = ConstantVector::get(Indices);
+ ShuffleVectorInst *Shuffle = new ShuffleVectorInst(Arg, Arg, Mask);
+ Shuffle->insertBefore(Brnch);
+ Arg = Shuffle;
+ ArgType = Arg->getType();
+ TotalSize += EleSize;
+ }
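+          // Bitcast the vector to equivalently sized integer chunks
+          // (i16/i32/i64 elements) so it can be stored into the printf
+          // buffer with plain integer stores.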
+ switch (EleSize) {
+ default:
+ EleCount = TotalSize / 64;
+ IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext()));
+ break;
+ case 8:
+ if (EleCount >= 8) {
+ EleCount = TotalSize / 64;
+ IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext()));
+ } else if (EleCount >= 3) {
+ EleCount = 1;
+ IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext()));
+ } else {
+ EleCount = 1;
+ IType = dyn_cast<Type>(Type::getInt16Ty(ArgType->getContext()));
+ }
+ break;
+ case 16:
+ if (EleCount >= 3) {
+ EleCount = TotalSize / 64;
+ IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext()));
+ } else {
+ EleCount = 1;
+ IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext()));
+ }
+ break;
+ }
+ if (EleCount > 1) {
+ IType = dyn_cast<Type>(VectorType::get(IType, EleCount));
+ }
+ Arg = new BitCastInst(Arg, IType, "PrintArgVect", Brnch);
+ WhatToStore.push_back(Arg);
+ } else {
+ WhatToStore.push_back(Arg);
+ }
+ for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) {
+ Value *TheBtCast = WhatToStore[I];
+ unsigned ArgSize =
+ TD->getTypeAllocSizeInBits(TheBtCast->getType()) / 8;
+ SmallVector<Value *, 1> BuffOffset;
+ BuffOffset.push_back(ConstantInt::get(I32Ty, ArgSize));
+
+ Type *ArgPointer = PointerType::get(TheBtCast->getType(), 1);
+ Value *CastedGEP =
+ new BitCastInst(BufferIdx, ArgPointer, "PrintBuffPtrCast", Brnch);
+ StoreInst *StBuff = new StoreInst(TheBtCast, CastedGEP, Brnch);
+ LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n"
+ << *StBuff << '\n');
+ (void)StBuff;
+ if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands())
+ break;
+ BufferIdx = dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create(
+ nullptr, BufferIdx, BuffOffset, "PrintBuffNextPtr", Brnch));
+ LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
+ << *BufferIdx << '\n');
+ }
+ }
+ }
+ }
+
+ // erase the printf calls
+ for (auto CI : Printfs)
+ CI->eraseFromParent();
+
+ Printfs.clear();
+ return true;
+}
+
+bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) {
+ Triple TT(M.getTargetTriple());
+ if (TT.getArch() == Triple::r600)
+ return false;
+
+ auto PrintfFunction = M.getFunction("printf");
+ if (!PrintfFunction)
+ return false;
+
+ for (auto &U : PrintfFunction->uses()) {
+ if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
+ if (CI->isCallee(&U))
+ Printfs.push_back(CI);
+ }
+ }
+
+ if (Printfs.empty())
+ return false;
+
+ TD = &M.getDataLayout();
+ auto DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+
+ return lowerPrintfForGpu(M, GetTLI);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e4c9d6685d4a..3e9dcca114a3 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -801,7 +801,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
GlobalVariable::NotThreadLocal,
AMDGPUAS::LOCAL_ADDRESS);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- GV->setAlignment(I.getAlignment());
+ GV->setAlignment(MaybeAlign(I.getAlignment()));
Value *TCntY, *TCntZ;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 815cbc5e26ee..4d78188b3dc3 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -17,9 +17,9 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -33,6 +33,7 @@
#include "AMDGPUGenRegisterBankInfo.def"
using namespace llvm;
+using namespace MIPatternMatch;
namespace {
@@ -84,9 +85,11 @@ public:
};
}
-AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
+AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
: AMDGPUGenRegisterBankInfo(),
- TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
+ Subtarget(ST),
+ TRI(Subtarget.getRegisterInfo()),
+ TII(Subtarget.getInstrInfo()) {
// HACK: Until this is fully tablegen'd.
static bool AlreadyInit = false;
@@ -163,11 +166,10 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost(
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
const TargetRegisterClass &RC) const {
+ if (&RC == &AMDGPU::SReg_1RegClass)
+ return AMDGPU::VCCRegBank;
- if (TRI->isSGPRClass(&RC))
- return getRegBank(AMDGPU::SGPRRegBankID);
-
- return getRegBank(AMDGPU::VGPRRegBankID);
+ return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank;
}
template <unsigned NumOps>
@@ -192,7 +194,8 @@ AMDGPURegisterBankInfo::addMappingFromTable(
Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
}
- unsigned MappingID = 0;
+ // getInstrMapping's default mapping uses ID 1, so start at 2.
+ unsigned MappingID = 2;
for (const auto &Entry : Table) {
for (unsigned I = 0; I < NumOps; ++I) {
int OpIdx = RegSrcOpIdx[I];
@@ -210,7 +213,7 @@ AMDGPURegisterBankInfo::addMappingFromTable(
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_readlane: {
static const OpRegBankEntry<3> Table[2] = {
// Perfectly legal.
@@ -251,7 +254,7 @@ RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_buffer_load: {
static const OpRegBankEntry<3> Table[4] = {
// Perfectly legal.
@@ -303,6 +306,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
}
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
+ // FIXME: Should have no register for immediate
static const OpRegBankEntry<1> Table[2] = {
// Perfectly legal.
{ { AMDGPU::SGPRRegBankID }, 1 },
@@ -319,12 +323,15 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
}
}
-static bool isInstrUniform(const MachineInstr &MI) {
+// FIXME: Returns uniform if there's no source value information. This is
+// probably wrong.
+static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())
return false;
const MachineMemOperand *MMO = *MI.memoperands_begin();
- return AMDGPUInstrInfo::isUniformMMO(MMO);
+ return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
+ AMDGPUInstrInfo::isUniformMMO(MMO);
}
RegisterBankInfo::InstructionMappings
@@ -337,6 +344,31 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
InstructionMappings AltMappings;
switch (MI.getOpcode()) {
+ case TargetOpcode::G_CONSTANT: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ if (Size == 1) {
+ static const OpRegBankEntry<1> Table[4] = {
+ { { AMDGPU::VGPRRegBankID }, 1 },
+ { { AMDGPU::SGPRRegBankID }, 1 },
+ { { AMDGPU::VCCRegBankID }, 1 },
+ { { AMDGPU::SCCRegBankID }, 1 }
+ };
+
+ return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_FRAME_INDEX:
+ case TargetOpcode::G_GLOBAL_VALUE: {
+ static const OpRegBankEntry<1> Table[2] = {
+ { { AMDGPU::VGPRRegBankID }, 1 },
+ { { AMDGPU::SGPRRegBankID }, 1 }
+ };
+
+ return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
+ }
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR: {
@@ -408,23 +440,29 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AltMappings.push_back(&VSMapping);
break;
}
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ case TargetOpcode::G_SEXTLOAD: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
+ unsigned PtrSize = PtrTy.getSizeInBits();
+ unsigned AS = PtrTy.getAddressSpace();
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
- // FIXME: Should we be hard coding the size for these mappings?
- if (isInstrUniform(MI)) {
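+    // Scalar (SMEM) loads cannot access LDS, region, or private memory, so
+    // only offer the SGPR mapping for other address spaces.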
+ if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
+ AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+ isInstrUniformNonExtLoadAlign4(MI)) {
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&SSMapping);
}
const InstructionMapping &VVMapping = getInstructionMapping(
2, 1, getOperandsMapping(
- {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
+ {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);
@@ -620,57 +658,53 @@ static LLT getHalfSizedType(LLT Ty) {
///
/// There is additional complexity to try for compare values to identify the
/// unique values used.
-void AMDGPURegisterBankInfo::executeInWaterfallLoop(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- ArrayRef<unsigned> OpIndices) const {
- MachineFunction *MF = MI.getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- MachineBasicBlock::iterator I(MI);
-
- MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
-
- // Use a set to avoid extra readfirstlanes in the case where multiple operands
- // are the same register.
- SmallSet<Register, 4> SGPROperandRegs;
- for (unsigned Op : OpIndices) {
- assert(MI.getOperand(Op).isUse());
- Register Reg = MI.getOperand(Op).getReg();
- const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
- if (OpBank->getID() == AMDGPU::VGPRRegBankID)
- SGPROperandRegs.insert(Reg);
- }
-
- // No operands need to be replaced, so no need to loop.
- if (SGPROperandRegs.empty())
- return;
-
- MachineIRBuilder B(MI);
+bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
+ MachineIRBuilder &B,
+ iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs,
+ MachineRegisterInfo &MRI) const {
SmallVector<Register, 4> ResultRegs;
SmallVector<Register, 4> InitResultRegs;
SmallVector<Register, 4> PhiRegs;
- for (MachineOperand &Def : MI.defs()) {
- LLT ResTy = MRI.getType(Def.getReg());
- const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
- ResultRegs.push_back(Def.getReg());
- Register InitReg = B.buildUndef(ResTy).getReg(0);
- Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
- InitResultRegs.push_back(InitReg);
- PhiRegs.push_back(PhiReg);
- MRI.setRegBank(PhiReg, *DefBank);
- MRI.setRegBank(InitReg, *DefBank);
+
+ MachineBasicBlock &MBB = B.getMBB();
+ MachineFunction *MF = &B.getMF();
+
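+  // Pick the wave-size dependent exec-mask opcodes (wave32 vs wave64) once
+  // up front.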
+ const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
+ const unsigned WaveAndOpc = Subtarget.isWave32() ?
+ AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const unsigned MovTermOpc = Subtarget.isWave32() ?
+ AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
+ const unsigned XorTermOpc = Subtarget.isWave32() ?
+ AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
+ const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
+ AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ const unsigned ExecReg = Subtarget.isWave32() ?
+ AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ for (MachineInstr &MI : Range) {
+ for (MachineOperand &Def : MI.defs()) {
+ LLT ResTy = MRI.getType(Def.getReg());
+ const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
+ ResultRegs.push_back(Def.getReg());
+ Register InitReg = B.buildUndef(ResTy).getReg(0);
+ Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
+ InitResultRegs.push_back(InitReg);
+ PhiRegs.push_back(PhiReg);
+ MRI.setRegBank(PhiReg, *DefBank);
+ MRI.setRegBank(InitReg, *DefBank);
+ }
}
- Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
+ Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
// Don't bother using generic instructions/registers for the exec mask.
B.buildInstr(TargetOpcode::IMPLICIT_DEF)
.addDef(InitSaveExecReg);
- Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register PhiExec = MRI.createVirtualRegister(WaveRC);
+ Register NewExec = MRI.createVirtualRegister(WaveRC);
// To insert the loop we need to split the block. Move everything before this
// point to a new block, and insert a new empty block before this instruction.
@@ -688,7 +722,7 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop(
// Move the rest of the block into a new block.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
- RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
MBB.addSuccessor(LoopBB);
RestoreExecBB->addSuccessor(RemainderBB);
@@ -711,164 +745,173 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop(
.addMBB(LoopBB);
}
- // Move the instruction into the loop.
- LoopBB->splice(LoopBB->end(), &MBB, I);
- I = std::prev(LoopBB->end());
+ const DebugLoc &DL = B.getDL();
+
+ // Figure out the iterator range after splicing the instructions.
+ auto NewBegin = std::prev(LoopBB->end());
- B.setInstr(*I);
+ // Move the instruction into the loop. Note we moved everything after
+ // Range.end() already into a new block, so Range.end() is no longer valid.
+ LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
+
+ auto NewEnd = LoopBB->end();
+
+ MachineBasicBlock::iterator I = Range.begin();
+ B.setInsertPt(*LoopBB, I);
Register CondReg;
- for (MachineOperand &Op : MI.uses()) {
- if (!Op.isReg())
- continue;
+ for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
+ for (MachineOperand &Op : MI.uses()) {
+ if (!Op.isReg() || Op.isDef())
+ continue;
- assert(!Op.isDef());
- if (SGPROperandRegs.count(Op.getReg())) {
- LLT OpTy = MRI.getType(Op.getReg());
- unsigned OpSize = OpTy.getSizeInBits();
-
- // Can only do a readlane of 32-bit pieces.
- if (OpSize == 32) {
- // Avoid extra copies in the simple case of one 32-bit register.
- Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- MRI.setType(CurrentLaneOpReg, OpTy);
-
- constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
- .addReg(Op.getReg());
-
- Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- bool First = CondReg == AMDGPU::NoRegister;
- if (First)
- CondReg = NewCondReg;
-
- // Compare the just read M0 value to all possible Idx values.
- B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
- .addDef(NewCondReg)
- .addReg(CurrentLaneOpReg)
- .addReg(Op.getReg());
- Op.setReg(CurrentLaneOpReg);
-
- if (!First) {
- Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
-
- // If there are multiple operands to consider, and the conditions.
- B.buildInstr(AMDGPU::S_AND_B64)
- .addDef(AndReg)
- .addReg(NewCondReg)
- .addReg(CondReg);
- CondReg = AndReg;
- }
- } else {
- LLT S32 = LLT::scalar(32);
- SmallVector<Register, 8> ReadlanePieces;
+ if (SGPROperandRegs.count(Op.getReg())) {
+ LLT OpTy = MRI.getType(Op.getReg());
+ unsigned OpSize = OpTy.getSizeInBits();
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
+ // Can only do a readlane of 32-bit pieces.
+ if (OpSize == 32) {
+ // Avoid extra copies in the simple case of one 32-bit register.
+ Register CurrentLaneOpReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ MRI.setType(CurrentLaneOpReg, OpTy);
- bool Is64 = OpSize % 64 == 0;
+ constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(Op.getReg());
- LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
- unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
- : AMDGPU::V_CMP_EQ_U32_e64;
+ Register NewCondReg = MRI.createVirtualRegister(WaveRC);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
+ // Compare the just read M0 value to all possible Idx values.
+ B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(Op.getReg());
+ Op.setReg(CurrentLaneOpReg);
- // Insert the unmerge before the loop.
+ if (!First) {
+ Register AndReg = MRI.createVirtualRegister(WaveRC);
- B.setMBB(MBB);
- auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
- B.setInstr(*I);
+          // If there are multiple operands to consider, AND the conditions
+          // together.
+ B.buildInstr(WaveAndOpc)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ } else {
+ LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 8> ReadlanePieces;
- unsigned NumPieces = Unmerge->getNumOperands() - 1;
- for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
- unsigned UnmergePiece = Unmerge.getReg(PieceIdx);
+ // The compares can be done as 64-bit, but the extract needs to be done
+ // in 32-bit pieces.
- Register CurrentLaneOpReg;
- if (Is64) {
- Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
- Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+ bool Is64 = OpSize % 64 == 0;
- MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
- MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
- MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+ LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
+ unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
+ : AMDGPU::V_CMP_EQ_U32_e64;
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegLo)
- .addReg(UnmergePiece, 0, AMDGPU::sub0);
+ // The compares can be done as 64-bit, but the extract needs to be done
+ // in 32-bit pieces.
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegHi)
- .addReg(UnmergePiece, 0, AMDGPU::sub1);
+ // Insert the unmerge before the loop.
- CurrentLaneOpReg =
- B.buildMerge(LLT::scalar(64),
- {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
- .getReg(0);
+ B.setMBB(MBB);
+ auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+ B.setInstr(*I);
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
+ unsigned NumPieces = Unmerge->getNumOperands() - 1;
+ for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
+ Register UnmergePiece = Unmerge.getReg(PieceIdx);
- if (OpTy.getScalarSizeInBits() == 64) {
- // If we need to produce a 64-bit element vector, so use the
- // merged pieces
- ReadlanePieces.push_back(CurrentLaneOpReg);
+ Register CurrentLaneOpReg;
+ if (Is64) {
+ Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
+ Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegLo)
+ .addReg(UnmergePiece, 0, AMDGPU::sub0);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegHi)
+ .addReg(UnmergePiece, 0, AMDGPU::sub1);
+
+ CurrentLaneOpReg =
+ B.buildMerge(LLT::scalar(64),
+ {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
+ .getReg(0);
+
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
+
+ if (OpTy.getScalarSizeInBits() == 64) {
+            // If we need to produce a 64-bit element vector, use the
+            // merged pieces.
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ } else {
+ // 32-bit element type.
+ ReadlanePieces.push_back(CurrentLaneOpRegLo);
+ ReadlanePieces.push_back(CurrentLaneOpRegHi);
+ }
} else {
- // 32-bit element type.
- ReadlanePieces.push_back(CurrentLaneOpRegLo);
- ReadlanePieces.push_back(CurrentLaneOpRegHi);
+ CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+ ReadlanePieces.push_back(CurrentLaneOpReg);
}
- } else {
- CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
- MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpReg)
- .addReg(UnmergePiece);
- ReadlanePieces.push_back(CurrentLaneOpReg);
- }
+ Register NewCondReg = MRI.createVirtualRegister(WaveRC);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
- Register NewCondReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- bool First = CondReg == AMDGPU::NoRegister;
- if (First)
- CondReg = NewCondReg;
+ B.buildInstr(CmpOp)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(UnmergePiece);
- B.buildInstr(CmpOp)
- .addDef(NewCondReg)
- .addReg(CurrentLaneOpReg)
- .addReg(UnmergePiece);
+ if (!First) {
+ Register AndReg = MRI.createVirtualRegister(WaveRC);
- if (!First) {
- Register AndReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+            // If there are multiple operands to consider, AND the conditions together.
+ B.buildInstr(WaveAndOpc)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ }
- // If there are multiple operands to consider, and the conditions.
- B.buildInstr(AMDGPU::S_AND_B64)
- .addDef(AndReg)
- .addReg(NewCondReg)
- .addReg(CondReg);
- CondReg = AndReg;
+ // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
+ // BUILD_VECTOR
+ if (OpTy.isVector()) {
+ auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ } else {
+ auto Merge = B.buildMerge(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
}
- }
- // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
- // BUILD_VECTOR
- if (OpTy.isVector()) {
- auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
- } else {
- auto Merge = B.buildMerge(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
+ MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
}
-
- MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
}
}
}
@@ -876,16 +919,16 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.setInsertPt(*LoopBB, LoopBB->end());
// Update EXEC, save the original EXEC value to VCC.
- B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
+ B.buildInstr(AndSaveExecOpc)
.addDef(NewExec)
.addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
- B.buildInstr(AMDGPU::S_XOR_B64_term)
- .addDef(AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ B.buildInstr(XorTermOpc)
+ .addDef(ExecReg)
+ .addReg(ExecReg)
.addReg(NewExec);
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
@@ -896,14 +939,60 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop(
.addMBB(LoopBB);
// Save the EXEC mask before the loop.
- BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
+ .addReg(ExecReg);
// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
- B.buildInstr(AMDGPU::S_MOV_B64_term)
- .addDef(AMDGPU::EXEC)
+ B.buildInstr(MovTermOpc)
+ .addDef(ExecReg)
.addReg(SaveExecReg);
+
+ // Restore the insert point before the original instruction.
+ B.setInsertPt(MBB, MBB.end());
+
+ return true;
+}
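The loop built above follows the usual waterfall pattern: read the value held by the first still-active lane, form the mask of lanes holding the same value, run the instruction once for that group, then drop the group from exec and repeat. A minimal scalar model of that control flow, with made-up lane values and plain C++ in place of the LLVM API:

    #include <array>
    #include <cstdio>

    // Index of the lowest set bit in a non-zero mask.
    static int firstActiveLane(unsigned Mask) {
      int L = 0;
      while (!((Mask >> L) & 1))
        ++L;
      return L;
    }

    int main() {
      std::array<int, 8> LaneVal = {3, 7, 3, 1, 7, 7, 3, 1}; // divergent operand
      unsigned Exec = 0xffu;                                  // active-lane mask
      while (Exec) {
        int Uniform = LaneVal[firstActiveLane(Exec)]; // models V_READFIRSTLANE_B32
        unsigned Match = 0;
        for (int L = 0; L != 8; ++L)
          if (((Exec >> L) & 1) && LaneVal[L] == Uniform)
            Match |= 1u << L;                         // models V_CMP_EQ_U32 building the trip mask
        std::printf("run once with value %d for lanes 0x%02x\n", Uniform, Match);
        Exec &= ~Match;                               // models the exec-mask update that retires the group
      }
      return 0;
    }

For this input the model finishes in three trips (values 3, 7 and 1), which matches the bound of the real loop: one iteration per distinct value across the active lanes.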
+
+// Return any unique registers used by \p MI at \p OpIndices that need to be
+// handled in a waterfall loop. Returns these registers in \p
+// SGPROperandRegs. Returns true if there are any operands to handle and a
+// waterfall loop is necessary.
+bool AMDGPURegisterBankInfo::collectWaterfallOperands(
+ SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
+ MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
+ for (unsigned Op : OpIndices) {
+ assert(MI.getOperand(Op).isUse());
+ Register Reg = MI.getOperand(Op).getReg();
+ const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
+ if (OpBank->getID() == AMDGPU::VGPRRegBankID)
+ SGPROperandRegs.insert(Reg);
+ }
+
+  // If no operands need to be replaced, there is no need to loop.
+ return !SGPROperandRegs.empty();
+}
+
+bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
+ MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const {
+ // Use a set to avoid extra readfirstlanes in the case where multiple operands
+ // are the same register.
+ SmallSet<Register, 4> SGPROperandRegs;
+
+ if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
+ return false;
+
+ MachineBasicBlock::iterator I = MI.getIterator();
+ return executeInWaterfallLoop(B, make_range(I, std::next(I)),
+ SGPROperandRegs, MRI);
+}
+
+bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const {
+ MachineIRBuilder B(MI);
+ return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}
// Legalize an operand that must be an SGPR by inserting a readfirstlane.
@@ -960,8 +1049,13 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
// If the pointer is an SGPR, we have nothing to do.
- if (SrcRegs.empty())
- return false;
+ if (SrcRegs.empty()) {
+ Register PtrReg = MI.getOperand(1).getReg();
+ const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
+ if (PtrBank == &AMDGPU::SGPRRegBank)
+ return false;
+ SrcRegs.push_back(PtrReg);
+ }
assert(LoadSize % MaxNonSmrdLoadSize == 0);
@@ -1013,6 +1107,33 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
return true;
}
+bool AMDGPURegisterBankInfo::applyMappingImage(
+ MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI, int RsrcIdx) const {
+ const int NumDefs = MI.getNumExplicitDefs();
+
+ // The reported argument index is relative to the IR intrinsic call arguments,
+ // so we need to shift by the number of defs and the intrinsic ID.
+ RsrcIdx += NumDefs + 1;
+
+ // Insert copies to VGPR arguments.
+ applyDefaultMapping(OpdMapper);
+
+ // Fixup any SGPR arguments.
+ SmallVector<unsigned, 4> SGPRIndexes;
+ for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
+ if (!MI.getOperand(I).isReg())
+ continue;
+
+ // If this intrinsic has a sampler, it immediately follows rsrc.
+ if (I == RsrcIdx || I == RsrcIdx + 1)
+ SGPRIndexes.push_back(I);
+ }
+
+ executeInWaterfallLoop(MI, MRI, SGPRIndexes);
+ return true;
+}
+
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static void substituteSimpleCopyRegs(
@@ -1024,6 +1145,184 @@ static void substituteSimpleCopyRegs(
}
}
+/// Handle register layout difference for f16 images for some subtargets.
+Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
+ MachineRegisterInfo &MRI,
+ Register Reg) const {
+ if (!Subtarget.hasUnpackedD16VMem())
+ return Reg;
+
+ const LLT S16 = LLT::scalar(16);
+ LLT StoreVT = MRI.getType(Reg);
+ if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
+ return Reg;
+
+ auto Unmerge = B.buildUnmerge(S16, Reg);
+
+ SmallVector<Register, 4> WideRegs;
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ WideRegs.push_back(Unmerge.getReg(I));
+
+ const LLT S32 = LLT::scalar(32);
+ int NumElts = StoreVT.getNumElements();
+
+ return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+}
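On subtargets with unpacked D16 VMem the helper above widens a <N x s16> store value so that every 16-bit element occupies its own 32-bit register. A small scalar sketch of that layout change, with illustrative values only and no LLVM API:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint16_t Packed[4] = {0x1111, 0x2222, 0x3333, 0x4444}; // a <4 x s16> value
      uint32_t Unpacked[4];
      for (int I = 0; I != 4; ++I)
        Unpacked[I] = Packed[I]; // one s16 piece per 32-bit lane, high half unused
      std::printf("lane0 = 0x%08x, lane1 = 0x%08x\n", Unpacked[0], Unpacked[1]);
      return 0; // prints 0x00001111, 0x00002222
    }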
+
+static std::pair<Register, unsigned>
+getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
+ int64_t Const;
+ if (mi_match(Reg, MRI, m_ICst(Const)))
+ return std::make_pair(Register(), Const);
+
+ Register Base;
+ if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
+ return std::make_pair(Base, Const);
+
+ // TODO: Handle G_OR used for add case
+ return std::make_pair(Reg, 0);
+}
+
+std::pair<Register, unsigned>
+AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
+ Register OrigOffset) const {
+ const unsigned MaxImm = 4095;
+ Register BaseReg;
+ unsigned ImmOffset;
+ const LLT S32 = LLT::scalar(32);
+
+ std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
+ OrigOffset);
+
+ unsigned C1 = 0;
+ if (ImmOffset != 0) {
+ // If the immediate value is too big for the immoffset field, put the value
+ // and -4096 into the immoffset field so that the value that is copied/added
+ // for the voffset field is a multiple of 4096, and it stands more chance
+ // of being CSEd with the copy/add for another similar load/store.
+ // However, do not do that rounding down to a multiple of 4096 if that is a
+ // negative number, as it appears to be illegal to have a negative offset
+ // in the vgpr, even if adding the immediate offset makes it positive.
+ unsigned Overflow = ImmOffset & ~MaxImm;
+ ImmOffset -= Overflow;
+ if ((int32_t)Overflow < 0) {
+ Overflow += ImmOffset;
+ ImmOffset = 0;
+ }
+
+ C1 = ImmOffset;
+ if (Overflow != 0) {
+ if (!BaseReg)
+ BaseReg = B.buildConstant(S32, Overflow).getReg(0);
+ else {
+ auto OverflowVal = B.buildConstant(S32, Overflow);
+ BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
+ }
+ }
+ }
+
+ if (!BaseReg)
+ BaseReg = B.buildConstant(S32, 0).getReg(0);
+
+ return {BaseReg, C1};
+}
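For the case where the whole offset folds to a constant, the splitting rule above reduces to a little integer arithmetic: everything that does not fit in the 12-bit immoffset field is rounded to a multiple of 4096 and pushed to the voffset side. A self-contained sketch of just that arithmetic follows; the helper name is hypothetical, and the real code also materializes the base with MachineIRBuilder and handles a register base:

    #include <cstdint>
    #include <cstdio>
    #include <utility>

    // Returns {amount added to voffset, immoffset field value}.
    static std::pair<uint32_t, uint32_t> splitConstantBufferOffset(uint32_t Offset) {
      const uint32_t MaxImm = 4095;
      uint32_t ImmOffset = Offset;
      uint32_t Overflow = ImmOffset & ~MaxImm; // the multiple-of-4096 part
      ImmOffset -= Overflow;
      if ((int32_t)Overflow < 0) { // never leave a negative value for voffset
        Overflow += ImmOffset;
        ImmOffset = 0;
      }
      return {Overflow, ImmOffset};
    }

    int main() {
      auto R = splitConstantBufferOffset(5000);
      std::printf("voffset += %u, immoffset = %u\n", R.first, R.second); // 4096, 904
      return 0;
    }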
+
+static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
+ int64_t C;
+ return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
+}
+
+static unsigned extractGLC(unsigned CachePolicy) {
+ return CachePolicy & 1;
+}
+
+static unsigned extractSLC(unsigned CachePolicy) {
+ return (CachePolicy >> 1) & 1;
+}
+
+static unsigned extractDLC(unsigned CachePolicy) {
+ return (CachePolicy >> 2) & 1;
+}
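The three extractors above imply the cachepolicy bit layout used by these intrinsics: bit 0 is glc, bit 1 is slc, bit 2 is dlc. A quick standalone check of that layout:

    #include <cstdio>

    int main() {
      unsigned CachePolicy = (1u << 0) | (1u << 2); // glc and dlc set, slc clear
      std::printf("glc=%u slc=%u dlc=%u\n",
                  CachePolicy & 1, (CachePolicy >> 1) & 1, (CachePolicy >> 2) & 1);
      return 0; // prints glc=1 slc=0 dlc=1
    }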
+
+MachineInstr *
+AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
+ MachineInstr &MI) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ executeInWaterfallLoop(B, MI, MRI, {2, 4});
+
+ // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
+
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(VData);
+
+ int EltSize = Ty.getScalarSizeInBits();
+ int Size = Ty.getSizeInBits();
+
+ // FIXME: Broken integer truncstore.
+ if (EltSize != 32)
+ report_fatal_error("unhandled intrinsic store");
+
+ // FIXME: Verifier should enforce 1 MMO for these intrinsics.
+ const int MemSize = (*MI.memoperands_begin())->getSize();
+
+ Register RSrc = MI.getOperand(2).getReg();
+ Register VOffset = MI.getOperand(3).getReg();
+ Register SOffset = MI.getOperand(4).getReg();
+ unsigned CachePolicy = MI.getOperand(5).getImm();
+
+ unsigned ImmOffset;
+ std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
+
+ const bool Offen = !isZero(VOffset, MRI);
+
+ unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
+ switch (8 * MemSize) {
+ case 8:
+ Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
+ break;
+ case 16:
+ Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
+ break;
+ default:
+ Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
+ if (Size > 32)
+ Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
+ break;
+ }
+
+ // Set the insertion point back to the instruction in case it was moved into a
+ // loop.
+ B.setInstr(MI);
+
+ MachineInstrBuilder MIB = B.buildInstr(Opc)
+ .addUse(VData);
+
+ if (Offen)
+ MIB.addUse(VOffset);
+
+ MIB.addUse(RSrc)
+ .addUse(SOffset)
+ .addImm(ImmOffset)
+ .addImm(extractGLC(CachePolicy))
+ .addImm(extractSLC(CachePolicy))
+ .addImm(0) // tfe: FIXME: Remove from inst
+ .addImm(extractDLC(CachePolicy))
+ .cloneMemRefs(MI);
+
+ // FIXME: We need a way to report failure from applyMappingImpl.
+ // Insert constrain copies before inserting the loop.
+ if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
+ report_fatal_error("failed to constrain selected store intrinsic");
+
+ return MIB;
+}
+
void AMDGPURegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@@ -1289,12 +1588,202 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
return;
}
- case AMDGPU::G_EXTRACT_VECTOR_ELT:
- applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, { 2 });
+ case AMDGPU::G_BUILD_VECTOR:
+ case AMDGPU::G_BUILD_VECTOR_TRUNC: {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy != LLT::vector(2, 16))
+ break;
+
+ assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
+ substituteSimpleCopyRegs(OpdMapper, 1);
+ substituteSimpleCopyRegs(OpdMapper, 2);
+
+ const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
+ if (DstBank == &AMDGPU::SGPRRegBank)
+ break; // Can use S_PACK_* instructions.
+
+ MachineIRBuilder B(MI);
+
+ Register Lo = MI.getOperand(1).getReg();
+ Register Hi = MI.getOperand(2).getReg();
+ const LLT S32 = LLT::scalar(32);
+
+ const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI);
+ const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI);
+
+ Register ZextLo;
+ Register ShiftHi;
+
+ if (Opc == AMDGPU::G_BUILD_VECTOR) {
+ ZextLo = B.buildZExt(S32, Lo).getReg(0);
+ MRI.setRegBank(ZextLo, *BankLo);
+
+ Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
+ MRI.setRegBank(ZextHi, *BankHi);
+
+ auto ShiftAmt = B.buildConstant(S32, 16);
+ MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
+
+ ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
+ MRI.setRegBank(ShiftHi, *BankHi);
+ } else {
+ Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
+ MRI.setRegBank(MaskLo, *BankLo);
+
+ auto ShiftAmt = B.buildConstant(S32, 16);
+ MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
+
+ ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
+ MRI.setRegBank(ShiftHi, *BankHi);
+
+ ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
+ MRI.setRegBank(ZextLo, *BankLo);
+ }
+
+ auto Or = B.buildOr(S32, ZextLo, ShiftHi);
+ MRI.setRegBank(Or.getReg(0), *DstBank);
+
+ B.buildBitcast(DstReg, Or);
+ MI.eraseFromParent();
+ return;
+ }
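Both branches above end up packing the two halves into one 32-bit value that is then bitcast to <2 x s16>: G_BUILD_VECTOR zero-extends each half, while the TRUNC form masks the low half and shifts the high half. A scalar sketch of the two recipes, not the LLVM API:

    #include <cstdint>
    #include <cstdio>

    static uint32_t packBuildVector(uint16_t Lo, uint16_t Hi) {
      uint32_t ZextLo = Lo;                  // G_ZEXT of the low element
      uint32_t ShiftHi = uint32_t(Hi) << 16; // G_ZEXT + G_SHL 16 of the high element
      return ZextLo | ShiftHi;               // G_OR, then G_BITCAST to <2 x s16>
    }

    static uint32_t packBuildVectorTrunc(uint32_t Lo, uint32_t Hi) {
      return (Lo & 0xffff) | (Hi << 16);     // G_AND mask, G_SHL, G_OR
    }

    int main() {
      std::printf("0x%08x 0x%08x\n", packBuildVector(0x1234, 0xabcd),
                  packBuildVectorTrunc(0x1234, 0xabcd)); // both print 0xabcd1234
      return 0;
    }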
+ case AMDGPU::G_EXTRACT_VECTOR_ELT: {
+ SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
+
+ assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
+
+ if (DstRegs.empty()) {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, { 2 });
+ return;
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register IdxReg = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ (void)DstTy;
+
+ assert(DstTy.getSizeInBits() == 64);
+
+ LLT SrcTy = MRI.getType(SrcReg);
+ const LLT S32 = LLT::scalar(32);
+ LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+
+ MachineIRBuilder B(MI);
+ auto CastSrc = B.buildBitcast(Vec32, SrcReg);
+ auto One = B.buildConstant(S32, 1);
+
+    // Split the 64-bit element access into 32-bit pieces. Prepare to move all of the
+ // new instructions into a waterfall loop if necessary.
+ //
+ // Don't put the bitcast or constant in the loop.
+ MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
+
+ // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
+ auto IdxLo = B.buildShl(S32, IdxReg, One);
+ auto IdxHi = B.buildAdd(S32, IdxLo, One);
+ B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
+ B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
+
+ const ValueMapping &DstMapping
+ = OpdMapper.getInstrMapping().getOperandMapping(0);
+
+ // FIXME: Should be getting from mapping or not?
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
+ MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank);
+ MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
+ MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
+
+ SmallSet<Register, 4> OpsToWaterfall;
+ if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Remove the original instruction to avoid potentially confusing the
+ // waterfall loop logic.
+ B.setInstr(*Span.begin());
+ MI.eraseFromParent();
+ executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
+ OpsToWaterfall, MRI);
+ return;
+ }
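The index arithmetic above is simply a scale by two: element i of the <N x s64> vector becomes elements 2*i and 2*i+1 of the bitcast <2N x s32> vector, and the G_INSERT_VECTOR_ELT case below reuses the same math. A one-line check:

    #include <cstdio>

    int main() {
      unsigned Idx = 3;          // original s64 element index
      unsigned IdxLo = Idx << 1; // the G_SHL by the constant 1
      unsigned IdxHi = IdxLo + 1;
      std::printf("s64 element %u -> s32 elements %u and %u\n", Idx, IdxLo, IdxHi);
      return 0; // prints 3 -> 6 and 7
    }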
+ case AMDGPU::G_INSERT_VECTOR_ELT: {
+ SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
+
+ assert(OpdMapper.getVRegs(0).empty());
+ assert(OpdMapper.getVRegs(1).empty());
+ assert(OpdMapper.getVRegs(3).empty());
+
+ if (InsRegs.empty()) {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, { 3 });
+ return;
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register InsReg = MI.getOperand(2).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ LLT InsTy = MRI.getType(InsReg);
+ (void)InsTy;
+
+ assert(InsTy.getSizeInBits() == 64);
+
+ const LLT S32 = LLT::scalar(32);
+ LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+
+ MachineIRBuilder B(MI);
+ auto CastSrc = B.buildBitcast(Vec32, SrcReg);
+ auto One = B.buildConstant(S32, 1);
+
+    // Split the 64-bit element access into 32-bit pieces. Prepare to move all of the
+ // new instructions into a waterfall loop if necessary.
+ //
+ // Don't put the bitcast or constant in the loop.
+ MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
+
+ // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
+ auto IdxLo = B.buildShl(S32, IdxReg, One);
+ auto IdxHi = B.buildAdd(S32, IdxLo, One);
+
+ auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
+ auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
+ B.buildBitcast(DstReg, InsHi);
+
+ const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
+ const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI);
+
+ MRI.setRegBank(InsReg, *InsSrcBank);
+ MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
+ MRI.setRegBank(InsLo.getReg(0), *DstBank);
+ MRI.setRegBank(InsHi.getReg(0), *DstBank);
+ MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
+
+ SmallSet<Register, 4> OpsToWaterfall;
+ if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ B.setInstr(*Span.begin());
+ MI.eraseFromParent();
+
+ executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
+ OpsToWaterfall, MRI);
return;
+ }
case AMDGPU::G_INTRINSIC: {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_s_buffer_load: {
// FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
executeInWaterfallLoop(MI, MRI, { 2, 3 });
@@ -1303,8 +1792,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_readlane: {
substituteSimpleCopyRegs(OpdMapper, 2);
- assert(empty(OpdMapper.getVRegs(0)));
- assert(empty(OpdMapper.getVRegs(3)));
+ assert(OpdMapper.getVRegs(0).empty());
+ assert(OpdMapper.getVRegs(3).empty());
// Make sure the index is an SGPR. It doesn't make sense to run this in a
// waterfall loop, so assume it's a uniform value.
@@ -1312,9 +1801,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case Intrinsic::amdgcn_writelane: {
- assert(empty(OpdMapper.getVRegs(0)));
- assert(empty(OpdMapper.getVRegs(2)));
- assert(empty(OpdMapper.getVRegs(3)));
+ assert(OpdMapper.getVRegs(0).empty());
+ assert(OpdMapper.getVRegs(2).empty());
+ assert(OpdMapper.getVRegs(3).empty());
substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
@@ -1327,7 +1816,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
break;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ auto IntrID = MI.getIntrinsicID();
+ switch (IntrID) {
case Intrinsic::amdgcn_buffer_load: {
executeInWaterfallLoop(MI, MRI, { 2 });
return;
@@ -1335,23 +1825,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
// This is only allowed to execute with 1 lane, so readfirstlane is safe.
- assert(empty(OpdMapper.getVRegs(0)));
+ assert(OpdMapper.getVRegs(0).empty());
substituteSimpleCopyRegs(OpdMapper, 3);
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;
}
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_br: {
+    // Only the first lane executes, so readfirstlane is safe.
+ substituteSimpleCopyRegs(OpdMapper, 1);
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ return;
+ }
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all: {
+    // Only the first lane executes, so readfirstlane is safe.
+ constrainOpWithReadfirstlane(MI, MRI, 1); // M0
+ return;
+ }
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// FIXME: Should this use a waterfall loop?
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;
}
- default:
+ case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_buffer_load_format:
+ case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_raw_buffer_store:
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ case Intrinsic::amdgcn_raw_tbuffer_store: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {2, 4});
+ return;
+ }
+ case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_store:
+ case Intrinsic::amdgcn_struct_tbuffer_load:
+ case Intrinsic::amdgcn_struct_tbuffer_store: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {2, 5});
+ return;
+ }
+ default: {
+ if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
+ AMDGPU::lookupRsrcIntrinsic(IntrID)) {
+ // Non-images can have complications from operands that allow both SGPR
+ // and VGPR. For now it's too complicated to figure out the final opcode
+ // to derive the register bank from the MCInstrDesc.
+ if (RSrcIntrin->IsImage) {
+ applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
+ return;
+ }
+ }
+
break;
}
+ }
break;
}
- case AMDGPU::G_LOAD: {
+ case AMDGPU::G_LOAD:
+ case AMDGPU::G_ZEXTLOAD:
+ case AMDGPU::G_SEXTLOAD: {
if (applyMappingWideLoad(MI, OpdMapper, MRI))
return;
break;
@@ -1452,25 +1989,71 @@ AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
}
const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ int RsrcIdx) const {
+ // The reported argument index is relative to the IR intrinsic call arguments,
+ // so we need to shift by the number of defs and the intrinsic ID.
+ RsrcIdx += MI.getNumExplicitDefs() + 1;
+
+ const int NumOps = MI.getNumOperands();
+ SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
+
+ // TODO: Should packed/unpacked D16 difference be reported here as part of
+ // the value mapping?
+ for (int I = 0; I != NumOps; ++I) {
+ if (!MI.getOperand(I).isReg())
+ continue;
+
+ Register OpReg = MI.getOperand(I).getReg();
+ unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
+
+ // FIXME: Probably need a new intrinsic register bank searchable table to
+ // handle arbitrary intrinsics easily.
+ //
+ // If this has a sampler, it immediately follows rsrc.
+ const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
+
+ if (MustBeSGPR) {
+      // If this must be an SGPR, we must report whatever it is as legal.
+ unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
+ } else {
+ // Some operands must be VGPR, and these are easy to copy to.
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ }
+ }
+
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
+}
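The same defs-plus-intrinsic-ID shift appears in applyMappingImage above. With made-up numbers, a table-reported rsrc argument at IR index 2 on an instruction with one explicit def lands at machine operand index 4:

    #include <cstdio>

    int main() {
      int RsrcArgIdxInIR = 2;  // hypothetical value from the rsrc intrinsic table
      int NumExplicitDefs = 1; // e.g. a single result register
      int MachineOpIdx = RsrcArgIdxInIR + NumExplicitDefs + 1; // +1 for the intrinsic ID
      std::printf("rsrc machine operand index = %d\n", MachineOpIdx); // prints 4
      return 0;
    }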
+
+const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+ SmallVector<const ValueMapping*, 2> OpdsMapping(2);
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
- unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ Register PtrReg = MI.getOperand(1).getReg();
+ LLT PtrTy = MRI.getType(PtrReg);
+ unsigned AS = PtrTy.getAddressSpace();
+ unsigned PtrSize = PtrTy.getSizeInBits();
const ValueMapping *ValMapping;
const ValueMapping *PtrMapping;
- if (isInstrUniform(MI)) {
+ const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
+
+ if (PtrBank == &AMDGPU::SGPRRegBank &&
+ (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
+ AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+ isInstrUniformNonExtLoadAlign4(MI)) {
// We have a uniform instruction so we want to use an SMRD load
ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
} else {
ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
- // FIXME: What would happen if we used SGPRRegBankID here?
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}
@@ -1494,6 +2077,31 @@ AMDGPURegisterBankInfo::getRegBankID(Register Reg,
return Bank ? Bank->getID() : Default;
}
+
+static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
+ return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+}
+
+const RegisterBankInfo::ValueMapping *
+AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+  // Lie and claim anything is legal, even though this needs to be an SGPR;
+ // applyMapping will have to deal with it as a waterfall loop.
+ unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
+ unsigned Size = getSizeInBits(Reg, MRI, TRI);
+ return AMDGPU::getValueMapping(Bank, Size);
+}
+
+const RegisterBankInfo::ValueMapping *
+AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ unsigned Size = getSizeInBits(Reg, MRI, TRI);
+ return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+}
+
///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
@@ -1536,7 +2144,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
int ResultBank = -1;
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
- unsigned Reg = MI.getOperand(I).getReg();
+ Register Reg = MI.getOperand(I).getReg();
const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
// FIXME: Assuming VGPR for any undetermined inputs.
@@ -1660,7 +2268,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
LLVM_FALLTHROUGH;
}
-
case AMDGPU::G_GEP:
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
@@ -1669,15 +2276,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_LSHR:
case AMDGPU::G_ASHR:
case AMDGPU::G_UADDO:
- case AMDGPU::G_SADDO:
case AMDGPU::G_USUBO:
- case AMDGPU::G_SSUBO:
case AMDGPU::G_UADDE:
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE:
- case AMDGPU::G_UMULH:
- case AMDGPU::G_SMULH:
case AMDGPU::G_SMIN:
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
@@ -1692,17 +2295,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FPTOUI:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
+ case AMDGPU::G_FMAD:
case AMDGPU::G_FSQRT:
+ case AMDGPU::G_FFLOOR:
+ case AMDGPU::G_FCEIL:
+ case AMDGPU::G_FRINT:
case AMDGPU::G_SITOFP:
case AMDGPU::G_UITOFP:
case AMDGPU::G_FPTRUNC:
case AMDGPU::G_FPEXT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2:
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FCANONICALIZE:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_INTRINSIC_ROUND:
+ case AMDGPU::G_AMDGPU_FFBH_U32:
+ return getDefaultMappingVOP(MI);
+ case AMDGPU::G_UMULH:
+ case AMDGPU::G_SMULH: {
+ if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_IMPLICIT_DEF: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
@@ -1710,12 +2328,19 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_FCONSTANT:
case AMDGPU::G_CONSTANT:
- case AMDGPU::G_FRAME_INDEX:
+ case AMDGPU::G_GLOBAL_VALUE:
case AMDGPU::G_BLOCK_ADDR: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case AMDGPU::G_FRAME_INDEX: {
+ // TODO: This should be the same as other constants, but eliminateFrameIndex
+ // currently assumes VALU uses.
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ break;
+ }
case AMDGPU::G_INSERT: {
unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
AMDGPU::VGPRRegBankID;
@@ -1737,8 +2362,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = nullptr;
break;
}
- case AMDGPU::G_MERGE_VALUES:
case AMDGPU::G_BUILD_VECTOR:
+ case AMDGPU::G_BUILD_VECTOR_TRUNC: {
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ if (DstTy == LLT::vector(2, 16)) {
+ unsigned DstSize = DstTy.getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
+ break;
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case AMDGPU::G_MERGE_VALUES:
case AMDGPU::G_CONCAT_VECTORS: {
unsigned Bank = isSALUMapping(MI) ?
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
@@ -1760,6 +2402,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_CTTZ_ZERO_UNDEF:
case AMDGPU::G_CTPOP:
case AMDGPU::G_BSWAP:
+ case AMDGPU::G_BITREVERSE:
case AMDGPU::G_FABS:
case AMDGPU::G_FNEG: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -1848,7 +2491,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
Op3Bank == AMDGPU::SGPRRegBankID &&
(Size == 32 || (Size == 64 &&
(Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
- MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
+ Subtarget.hasScalarCompareEq64()));
unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
@@ -1859,14 +2502,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_EXTRACT_VECTOR_ELT: {
- unsigned OutputBankID = isSALUMapping(MI) ?
- AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+    // A VGPR index can be used for a waterfall loop when indexing an SGPR vector.
+ unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
- OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
+ OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
// The index can be either if the source vector is VGPR.
OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
@@ -1879,15 +2524,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
- unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
+ MRI, *TRI);
+ unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
- OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
+ OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
+ InsertSize);
// The index can be either if the source vector is VGPR.
- OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
break;
}
case AMDGPU::G_UNMERGE_VALUES: {
@@ -1903,11 +2551,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_INTRINSIC: {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
default:
return getInvalidInstructionMapping();
- case Intrinsic::maxnum:
- case Intrinsic::minnum:
case Intrinsic::amdgcn_div_fmas:
case Intrinsic::amdgcn_trig_preop:
case Intrinsic::amdgcn_sin:
@@ -1938,6 +2584,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mbcnt_hi:
case Intrinsic::amdgcn_ubfe:
case Intrinsic::amdgcn_sbfe:
+ case Intrinsic::amdgcn_mul_u24:
+ case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_lerp:
case Intrinsic::amdgcn_sad_u8:
case Intrinsic::amdgcn_msad_u8:
@@ -1956,10 +2604,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_udot4:
case Intrinsic::amdgcn_sdot8:
case Intrinsic::amdgcn_udot8:
- case Intrinsic::amdgcn_fdiv_fast:
case Intrinsic::amdgcn_wwm:
case Intrinsic::amdgcn_wqm:
return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_ds_swizzle:
case Intrinsic::amdgcn_ds_permute:
case Intrinsic::amdgcn_ds_bpermute:
case Intrinsic::amdgcn_update_dpp:
@@ -2040,7 +2688,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_readlane: {
// This must be an SGPR, but accept a VGPR.
- unsigned IdxReg = MI.getOperand(3).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
@@ -2055,10 +2703,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_writelane: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned SrcReg = MI.getOperand(2).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
- unsigned IdxReg = MI.getOperand(3).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
@@ -2081,9 +2729,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
- default:
- return getInvalidInstructionMapping();
+ auto IntrID = MI.getIntrinsicID();
+ switch (IntrID) {
case Intrinsic::amdgcn_s_getreg:
case Intrinsic::amdgcn_s_memtime:
case Intrinsic::amdgcn_s_memrealtime:
@@ -2123,18 +2770,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
break;
case Intrinsic::amdgcn_exp:
- OpdsMapping[0] = nullptr; // IntrinsicID
- // FIXME: These are immediate values which can't be read from registers.
- OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
// FIXME: Could we support packed types here?
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
- // FIXME: These are immediate values which can't be read from registers.
- OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
break;
case Intrinsic::amdgcn_buffer_load: {
Register RSrc = MI.getOperand(2).getReg(); // SGPR
@@ -2169,11 +2809,97 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
}
- case Intrinsic::amdgcn_end_cf: {
+ case Intrinsic::amdgcn_end_cf:
+ case Intrinsic::amdgcn_init_exec: {
+ unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ case Intrinsic::amdgcn_else: {
+ unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
+ break;
+ }
+ case Intrinsic::amdgcn_kill: {
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ break;
+ }
+ case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_tbuffer_load: {
+ // FIXME: Should make intrinsic ID the last operand of the instruction,
+ // then this would be the same as store
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_raw_buffer_store:
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ case Intrinsic::amdgcn_raw_tbuffer_store: {
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_tbuffer_load: {
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_struct_buffer_store:
+ case Intrinsic::amdgcn_struct_tbuffer_store: {
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_init_exec_from_input: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_br: {
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
+ break;
+ }
+ default:
+ if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
+ AMDGPU::lookupRsrcIntrinsic(IntrID)) {
+ // Non-images can have complications from operands that allow both SGPR
+ // and VGPR. For now it's too complicated to figure out the final opcode
+ // to derive the register bank from the MCInstrDesc.
+ if (RSrcIntrin->IsImage)
+ return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
+ }
+
+ return getInvalidInstructionMapping();
}
break;
}
@@ -2216,6 +2942,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_LOAD:
+ case AMDGPU::G_ZEXTLOAD:
+ case AMDGPU::G_SEXTLOAD:
return getInstrMappingForLoad(MI);
case AMDGPU::G_ATOMICRMW_XCHG:
@@ -2228,6 +2956,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_MIN:
case AMDGPU::G_ATOMICRMW_UMAX:
case AMDGPU::G_ATOMICRMW_UMIN:
+ case AMDGPU::G_ATOMICRMW_FADD:
case AMDGPU::G_ATOMIC_CMPXCHG: {
return getDefaultMappingAllVGPR(MI);
}
@@ -2247,4 +2976,3 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getOperandsMapping(OpdsMapping),
MI.getNumOperands());
}
-
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index f3a96e2a6128..a14b74961118 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -13,6 +13,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -23,7 +25,9 @@
namespace llvm {
class LLT;
+class GCNSubtarget;
class MachineIRBuilder;
+class SIInstrInfo;
class SIRegisterInfo;
class TargetRegisterInfo;
@@ -36,9 +40,27 @@ protected:
#include "AMDGPUGenRegisterBank.inc"
};
class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
+ const GCNSubtarget &Subtarget;
const SIRegisterInfo *TRI;
-
- void executeInWaterfallLoop(MachineInstr &MI,
+ const SIInstrInfo *TII;
+
+ bool collectWaterfallOperands(
+ SmallSet<Register, 4> &SGPROperandRegs,
+ MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const;
+
+ bool executeInWaterfallLoop(
+ MachineIRBuilder &B,
+ iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs,
+ MachineRegisterInfo &MRI) const;
+
+ bool executeInWaterfallLoop(MachineIRBuilder &B,
+ MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const;
+ bool executeInWaterfallLoop(MachineInstr &MI,
MachineRegisterInfo &MRI,
ArrayRef<unsigned> OpIndices) const;
@@ -47,6 +69,19 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
bool applyMappingWideLoad(MachineInstr &MI,
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI) const;
+ bool
+ applyMappingImage(MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI, int RSrcIdx) const;
+
+ Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ Register Reg) const;
+
+ std::pair<Register, unsigned>
+ splitBufferOffsets(MachineIRBuilder &B, Register Offset) const;
+
+ MachineInstr *selectStoreIntrinsic(MachineIRBuilder &B,
+ MachineInstr &MI) const;
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
@@ -58,6 +93,16 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const TargetRegisterInfo &TRI,
unsigned Default = AMDGPU::VGPRRegBankID) const;
+ // Return a value mapping for an operand that is required to be an SGPR.
+ const ValueMapping *getSGPROpMapping(Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
+ // Return a value mapping for an operand that is required to be a VGPR.
+ const ValueMapping *getVGPROpMapping(Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
/// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p
/// Regs. This appropriately sets the regbank of the new registers.
void split64BitValueForMapping(MachineIRBuilder &B,
@@ -90,8 +135,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingAllVGPR(
const MachineInstr &MI) const;
+
+ const InstructionMapping &getImageMapping(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ int RsrcIdx) const;
+
public:
- AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
+ AMDGPURegisterBankInfo(const GCNSubtarget &STI);
unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
unsigned Size) const override;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 9555694fb106..00f53b157577 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,14 +7,14 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
- [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512]
+ [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
+ [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024]
>;
def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>;
// It is helpful to distinguish conditions from ordinary SGPRs.
-def VCCRegBank : RegisterBank <"VCC", [SReg_64]>;
+def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 7cffdf1a4dcf..9806e6b0714f 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -26,19 +26,59 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) {
- static const unsigned SubRegs[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
- AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
- AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
- AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
- AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24,
- AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29,
- AMDGPU::sub30, AMDGPU::sub31
- };
-
- assert(Channel < array_lengthof(SubRegs));
- return SubRegs[Channel];
+// Table of NumRegs-sized pieces at every 32-bit offset.
+static const uint16_t SubRegFromChannelTable[][32] = {
+ { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+ AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+ AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+ AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31
+ },
+ {
+ AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, AMDGPU::sub3_sub4,
+ AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, AMDGPU::sub6_sub7, AMDGPU::sub7_sub8,
+ AMDGPU::sub8_sub9, AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12,
+ AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, AMDGPU::sub15_sub16,
+ AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, AMDGPU::sub18_sub19, AMDGPU::sub19_sub20,
+ AMDGPU::sub20_sub21, AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24,
+ AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, AMDGPU::sub27_sub28,
+ AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, AMDGPU::sub30_sub31, AMDGPU::NoSubRegister
+ },
+ {
+ AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5,
+ AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9,
+ AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13,
+ AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17,
+ AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21,
+ AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25,
+ AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29,
+ AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister
+ },
+ {
+ AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6,
+ AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10,
+ AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14,
+ AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18,
+ AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22,
+ AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26,
+ AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30,
+ AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister
+ }
+};
+
+// FIXME: TableGen should generate something to make this manageable for all
+// register classes. At a minimum we could use the opposite of
+// composeSubRegIndices and go up from the base 32-bit subreg.
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel, unsigned NumRegs) {
+ const unsigned NumRegIndex = NumRegs - 1;
+
+ assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) &&
+ "Not implemented");
+ assert(Channel < array_lengthof(SubRegFromChannelTable[0]));
+ return SubRegFromChannelTable[NumRegIndex][Channel];
}
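The lookup is row = NumRegs - 1 (tuple width) and column = Channel (starting 32-bit lane), so for example getSubRegFromChannel(1, 2) yields AMDGPU::sub1_sub2. A miniature standalone model of that indexing, using illustrative string names instead of the generated subregister enum:

    #include <cassert>
    #include <cstdio>

    static const char *MiniTable[2][4] = {
      {"sub0", "sub1", "sub2", "sub3"},
      {"sub0_sub1", "sub1_sub2", "sub2_sub3", "sub3_sub4"},
    };

    static const char *miniSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1) {
      assert(NumRegs - 1 < 2 && Channel < 4 && "not covered by the mini table");
      return MiniTable[NumRegs - 1][Channel];
    }

    int main() {
      std::printf("%s\n", miniSubRegFromChannel(1, 2)); // prints sub1_sub2
      return 0;
    }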
void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 3453a8c1b0b3..9e713ca804a1 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -28,7 +28,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
- static unsigned getSubRegFromChannel(unsigned Channel);
+ static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1);
void reserveRegisterTuples(BitVector &, unsigned Reg) const;
};
diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index f8703c36127a..26b8b7840270 100644
--- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -81,6 +81,8 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_and>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
@@ -92,6 +94,8 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_and>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 1eb9b83456c5..3bb6dd4571c0 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -175,6 +175,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
HasFminFmaxLegacy(true),
EnablePromoteAlloca(false),
HasTrigReducedRange(false),
+ MaxWavesPerEU(10),
LocalMemorySize(0),
WavefrontSize(0)
{ }
@@ -261,6 +262,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
AddNoCarryInsts(false),
HasUnpackedD16VMem(false),
LDSMisalignedBug(false),
+ HasMFMAInlineLiteralBug(false),
ScalarizeGlobal(false),
@@ -278,9 +280,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
+ MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
- RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
+ RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
InstSelector.reset(new AMDGPUInstructionSelector(
*this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
@@ -489,28 +492,28 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
- unsigned &MaxAlign) const {
+ Align &MaxAlign) const {
assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL);
const DataLayout &DL = F.getParent()->getDataLayout();
uint64_t ExplicitArgBytes = 0;
- MaxAlign = 1;
+ MaxAlign = Align::None();
for (const Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
- unsigned Align = DL.getABITypeAlignment(ArgTy);
+ const Align Alignment(DL.getABITypeAlignment(ArgTy));
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
- ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
- MaxAlign = std::max(MaxAlign, Align);
+ ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
+ MaxAlign = std::max(MaxAlign, Alignment);
}
return ExplicitArgBytes;
}
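A minimal standalone sketch of the size/alignment accumulation now expressed with llvm::Align above; the argument sizes and alignments are illustrative values, not taken from a real DataLayout:

#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t Value, uint64_t Alignment) {
  return (Value + Alignment - 1) / Alignment * Alignment;
}

int main() {
  struct Arg { uint64_t Size, Alignment; };
  const Arg Args[] = {{4, 4}, {8, 8}, {2, 2}}; // e.g. i32, i64, i16 arguments
  uint64_t ExplicitArgBytes = 0, MaxAlign = 1;
  for (const Arg &A : Args) {
    // Pad to the argument's alignment, then append its allocation size.
    ExplicitArgBytes = alignTo(ExplicitArgBytes, A.Alignment) + A.Size;
    MaxAlign = std::max(MaxAlign, A.Alignment);
  }
  // i32: 0 -> 4; i64: pad to 8, -> 16; i16: -> 18; max alignment is 8.
  std::printf("%llu %llu\n", (unsigned long long)ExplicitArgBytes,
              (unsigned long long)MaxAlign);
}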
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
- unsigned &MaxAlign) const {
+ Align &MaxAlign) const {
uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
@@ -518,7 +521,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
unsigned ImplicitBytes = getImplicitArgNumBytes(F);
if (ImplicitBytes != 0) {
- unsigned Alignment = getAlignmentForImplicitArgPtr();
+ const Align Alignment = getAlignmentForImplicitArgPtr();
TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}
@@ -566,7 +569,7 @@ bool GCNSubtarget::hasMadF16() const {
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
- return 10;
+ return getMaxWavesPerEU();
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
@@ -591,25 +594,12 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
}
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
- if (VGPRs <= 24)
- return 10;
- if (VGPRs <= 28)
- return 9;
- if (VGPRs <= 32)
- return 8;
- if (VGPRs <= 36)
- return 7;
- if (VGPRs <= 40)
- return 6;
- if (VGPRs <= 48)
- return 5;
- if (VGPRs <= 64)
- return 4;
- if (VGPRs <= 84)
- return 3;
- if (VGPRs <= 128)
- return 2;
- return 1;
+ unsigned MaxWaves = getMaxWavesPerEU();
+ unsigned Granule = getVGPRAllocGranule();
+ if (VGPRs < Granule)
+ return MaxWaves;
+ unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
+ return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
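A standalone sketch of the granule-based rounding that replaces the hard-coded VGPR table above; the granule, register-file size and wave cap are illustrative values, while the real numbers come from the GCNSubtarget query functions:

#include <algorithm>
#include <cstdio>

static unsigned occupancyWithNumVGPRs(unsigned VGPRs, unsigned Granule,
                                      unsigned TotalVGPRs, unsigned MaxWaves) {
  if (VGPRs < Granule)
    return MaxWaves;
  // Round the request up to a whole allocation granule, then see how many
  // such allocations fit in the register file, clamped to [1, MaxWaves].
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(TotalVGPRs / RoundedRegs, 1u), MaxWaves);
}

int main() {
  // 26 VGPRs round up to 28; 256 / 28 = 9 waves, below the assumed cap of 10.
  std::printf("%u\n", occupancyWithNumVGPRs(26, 4, 256, 10));
}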
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
@@ -629,6 +619,20 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
+unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
+ unsigned LDSSize,
+ unsigned NumSGPRs,
+ unsigned NumVGPRs) const {
+ unsigned Occupancy =
+ std::min(getMaxWavesPerEU(),
+ getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
+ if (NumSGPRs)
+ Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
+ if (NumVGPRs)
+ Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
+ return Occupancy;
+}
+
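A standalone sketch of how computeOccupancy above combines the individual bounds; the per-resource limits are passed in directly here instead of being derived from the LDS size and register counts, and the numbers are illustrative:

#include <algorithm>
#include <cstdio>

static unsigned computeOccupancy(unsigned MaxWaves, unsigned LDSBound,
                                 unsigned SGPRBound, unsigned VGPRBound) {
  unsigned Occupancy = std::min(MaxWaves, LDSBound);
  if (SGPRBound) // zero means "not limiting", mirroring the NumSGPRs check
    Occupancy = std::min(Occupancy, SGPRBound);
  if (VGPRBound)
    Occupancy = std::min(Occupancy, VGPRBound);
  return Occupancy;
}

int main() {
  // LDS allows 8 waves, SGPRs allow 10, VGPRs allow 6 -> occupancy 6.
  std::printf("%u\n", computeOccupancy(10, 8, 10, 6));
}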
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -878,8 +882,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
- Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
+ Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
+ Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 78c3b823946d..936feb00c62b 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -75,6 +75,7 @@ protected:
bool HasFminFmaxLegacy;
bool EnablePromoteAlloca;
bool HasTrigReducedRange;
+ unsigned MaxWavesPerEU;
int LocalMemorySize;
unsigned WavefrontSize;
@@ -195,8 +196,8 @@ public:
return LocalMemorySize;
}
- unsigned getAlignmentForImplicitArgPtr() const {
- return isAmdHsaOS() ? 8 : 4;
+ Align getAlignmentForImplicitArgPtr() const {
+ return isAmdHsaOS() ? Align(8) : Align(4);
}
/// Returns the offset in bytes from the start of the input buffer
@@ -223,7 +224,9 @@ public:
/// subtarget.
virtual unsigned getMinWavesPerEU() const = 0;
- unsigned getMaxWavesPerEU() const { return 10; }
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget without any kind of limitation.
+ unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
/// Creates value range metadata on an workitemid.* inrinsic call or load.
bool makeLIDRangeMetadata(Instruction *I) const;
@@ -235,16 +238,17 @@ public:
return 16;
return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}
- uint64_t getExplicitKernArgSize(const Function &F,
- unsigned &MaxAlign) const;
- unsigned getKernArgSegmentSize(const Function &F,
- unsigned &MaxAlign) const;
+ uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
+ unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
virtual ~AMDGPUSubtarget() {}
};
class GCNSubtarget : public AMDGPUGenSubtargetInfo,
public AMDGPUSubtarget {
+
+ using AMDGPUSubtarget::getMaxWavesPerEU;
+
public:
enum TrapHandlerAbi {
TrapHandlerAbiNone = 0,
@@ -362,6 +366,7 @@ protected:
bool CaymanISA;
bool CFALUBug;
bool LDSMisalignedBug;
+ bool HasMFMAInlineLiteralBug;
bool HasVertexCache;
short TexVTXClauseSize;
bool ScalarizeGlobal;
@@ -416,7 +421,7 @@ public:
return CallLoweringInfo.get();
}
- const InstructionSelector *getInstructionSelector() const override {
+ InstructionSelector *getInstructionSelector() const override {
return InstSelector.get();
}
@@ -544,6 +549,14 @@ public:
return GFX9Insts;
}
+ bool hasScalarPackInsts() const {
+ return GFX9Insts;
+ }
+
+ bool hasScalarMulHiInsts() const {
+ return GFX9Insts;
+ }
+
TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
}
@@ -611,6 +624,11 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
+ /// \returns If target supports S_DENORM_MODE.
+ bool hasDenormModeInst() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX10;
+ }
+
bool useFlatForGlobal() const {
return FlatForGlobal;
}
@@ -848,9 +866,7 @@ public:
// on the pointer value itself may rely on the alignment / known low bits of
// the pointer. Set this to something above the minimum to avoid needing
// dynamic realignment in common cases.
- unsigned getStackAlignment() const {
- return 16;
- }
+ Align getStackAlignment() const { return Align(16); }
bool enableMachineScheduler() const override {
return true;
@@ -881,12 +897,6 @@ public:
return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
}
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget without any kind of limitation.
- unsigned getMaxWavesPerEU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU();
- }
-
/// \returns Number of waves per work group supported by the subtarget and
/// limited by given \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
@@ -944,6 +954,14 @@ public:
return HasDPP;
}
+ bool hasDPPBroadcasts() const {
+ return HasDPP && getGeneration() < GFX10;
+ }
+
+ bool hasDPPWavefrontShifts() const {
+ return HasDPP && getGeneration() < GFX10;
+ }
+
bool hasDPP8() const {
return HasDPP8;
}
@@ -974,6 +992,10 @@ public:
return SGPRInitBug;
}
+ bool hasMFMAInlineLiteralBug() const {
+ return HasMFMAInlineLiteralBug;
+ }
+
bool has12DWordStoreHazard() const {
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
}
@@ -1036,6 +1058,13 @@ public:
/// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+ /// Return occupancy for the given function, taking into account used LDS
+ /// and, if provided, the number of SGPRs and VGPRs.
+ /// Note that occupancy can also be affected by scratch allocation, but we
+ /// do not have enough information to compute it.
+ unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0,
+ unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
+
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
@@ -1226,9 +1255,7 @@ public:
return Gen;
}
- unsigned getStackAlignment() const {
- return 4;
- }
+ Align getStackAlignment() const { return Align(4); }
R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS);
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0ea8db04c298..e8cf77161a14 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -238,16 +238,17 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUInlinerPass(*PR);
+ initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNRegBankReassignPass(*PR);
initializeGCNNSAReassignPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
- return llvm::make_unique<AMDGPUTargetObjectFile>();
+ return std::make_unique<AMDGPUTargetObjectFile>();
}
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
+ return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
}
static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
@@ -257,7 +258,7 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
- new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
+ new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
@@ -412,6 +413,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
PM.add(createAMDGPUExternalAAWrapperPass());
}
PM.add(createAMDGPUUnifyMetadataPass());
+ PM.add(createAMDGPUPrintfRuntimeBinding());
PM.add(createAMDGPUPropagateAttributesLatePass(this));
if (Internalize) {
PM.add(createInternalizePass(mustPreserveGV));
@@ -482,7 +484,7 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+ I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
}
return I.get();
@@ -518,7 +520,7 @@ const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
+ I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
@@ -659,6 +661,8 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
+ addPass(createAMDGPUPrintfRuntimeBinding());
+
// This must occur before inlining, as the inliner will not look through
// bitcast calls.
addPass(createAMDGPUFixFunctionBitcastsPass());
@@ -681,12 +685,6 @@ void AMDGPUPassConfig::addIRPasses() {
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
- if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
- // TODO: May want to move later or split into an early and late one.
-
- addPass(createAMDGPUCodeGenPreparePass());
- }
-
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
if (TM.getTargetTriple().getArch() == Triple::r600)
addPass(createR600OpenCLImageTypeLoweringPass());
@@ -714,6 +712,11 @@ void AMDGPUPassConfig::addIRPasses() {
}
}
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+ // TODO: May want to move later or split into an early and late one.
+ addPass(createAMDGPUCodeGenPreparePass());
+ }
+
TargetPassConfig::addIRPasses();
// EarlyCSE is not always strong enough to clean up what LSR produces. For
@@ -1046,7 +1049,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return true;
if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
- !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) {
+ !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
}
@@ -1095,7 +1098,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
if (YamlMFI.ArgInfo &&
(parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
- AMDGPU::SReg_128RegClass,
+ AMDGPU::SGPR_128RegClass,
MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index aaed280a1270..616196ad5ba3 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -57,7 +57,7 @@ using namespace llvm;
static cl::opt<unsigned> UnrollThresholdPrivate(
"amdgpu-unroll-threshold-private",
cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
- cl::init(2500), cl::Hidden);
+ cl::init(2000), cl::Hidden);
static cl::opt<unsigned> UnrollThresholdLocal(
"amdgpu-unroll-threshold-local",
@@ -590,6 +590,61 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
return false;
}
+bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
+ Intrinsic::ID IID) const {
+ switch (IID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax:
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private:
+ OpIndexes.push_back(0);
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
+ IntrinsicInst *II, Value *OldV, Value *NewV) const {
+ auto IntrID = II->getIntrinsicID();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
+ const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
+ if (!IsVolatile->isZero())
+ return false;
+ Module *M = II->getParent()->getParent()->getParent();
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private: {
+ unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
+ AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ LLVMContext &Ctx = NewV->getType()->getContext();
+ ConstantInt *NewVal = (TrueAS == NewAS) ?
+ ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
+ II->replaceAllUsesWith(NewVal);
+ II->eraseFromParent();
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
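A standalone sketch of the amdgcn_is_shared / amdgcn_is_private folding above: once infer-address-spaces rewrites the pointer into a specific address space, the query collapses to a compile-time constant. The numeric address spaces are illustrative placeholders for AMDGPUAS::LOCAL_ADDRESS and AMDGPUAS::PRIVATE_ADDRESS:

#include <cstdio>

enum AddrSpace { Flat = 0, Global = 1, Local = 3, Private = 5 };

static bool foldIsShared(AddrSpace NewAS) { return NewAS == Local; }
static bool foldIsPrivate(AddrSpace NewAS) { return NewAS == Private; }

int main() {
  // A pointer proven to be in LDS: is_shared -> true, is_private -> false,
  // mirroring the ConstantInt replacement performed above.
  std::printf("%d %d\n", foldIsShared(Local), foldIsPrivate(Local));
}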
unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
if (ST->hasVOP3PInsts()) {
@@ -638,6 +693,39 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
CommonTTI.getUnrollingPreferences(L, SE, UP);
}
+unsigned GCNTTIImpl::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands) {
+ // Estimate extractelement elimination
+ if (const ExtractElementInst *EE = dyn_cast<ExtractElementInst>(U)) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(EE->getOperand(1));
+ unsigned Idx = -1;
+ if (CI)
+ Idx = CI->getZExtValue();
+ return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(),
+ Idx);
+ }
+
+ // Estimate insertelement elimination
+ if (const InsertElementInst *IE = dyn_cast<InsertElementInst>(U)) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
+ unsigned Idx = -1;
+ if (CI)
+ Idx = CI->getZExtValue();
+ return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx);
+ }
+
+ // Estimate different intrinsics, e.g. llvm.fabs
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+ SmallVector<Value *, 4> Args(II->arg_operands());
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(II))
+ FMF = FPMO->getFastMathFlags();
+ return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
+ FMF);
+ }
+ return BaseT::getUserCost(U, Operands);
+}
+
unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 6f1bf5a26f0d..67f7f9074f10 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -46,10 +46,18 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
Triple TargetTriple;
+ const TargetSubtargetInfo *ST;
+ const TargetLoweringBase *TLI;
+
+ const TargetSubtargetInfo *getST() const { return ST; }
+ const TargetLoweringBase *getTLI() const { return TLI; }
+
public:
explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
- : BaseT(TM, F.getParent()->getDataLayout()),
- TargetTriple(TM->getTargetTriple()) {}
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ TargetTriple(TM->getTargetTriple()),
+ ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()) {}
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
@@ -183,6 +191,11 @@ public:
return AMDGPUAS::FLAT_ADDRESS;
}
+ bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
+ Intrinsic::ID IID) const;
+ bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+
unsigned getVectorSplitCost() { return 0; }
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
@@ -191,7 +204,7 @@ public:
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
- unsigned getInliningThresholdMultiplier() { return 7; }
+ unsigned getInliningThresholdMultiplier() { return 9; }
int getInlinerVectorBonusPercent() { return 0; }
@@ -201,6 +214,7 @@ public:
int getMinMaxReductionCost(Type *Ty, Type *CondTy,
bool IsPairwiseForm,
bool IsUnsigned);
+ unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
};
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 12f2e9519c9e..101ecfc0c87c 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1307,8 +1307,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
if (LandBlkHasOtherPred) {
report_fatal_error("Extra register needed to handle CFG");
- unsigned CmpResReg =
- HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+ Register CmpResReg =
+ HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
report_fatal_error("Extra compare instruction needed to handle CFG");
insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET,
CmpResReg, DebugLoc());
@@ -1316,8 +1316,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// XXX: We are running this after RA, so creating virtual registers will
// cause an assertion failure in the PostRA scheduling pass.
- unsigned InitReg =
- HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+ Register InitReg =
+ HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg,
DebugLoc());
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 6d678966c98e..9dd511fab57c 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -143,6 +143,7 @@ public:
ImmTyDLC,
ImmTyGLC,
ImmTySLC,
+ ImmTySWZ,
ImmTyTFE,
ImmTyD16,
ImmTyClampSI,
@@ -216,14 +217,15 @@ public:
if (Kind == Token)
return true;
- if (Kind != Expression || !Expr)
- return false;
-
// When parsing operands, we can't always tell if something was meant to be
// a token, like 'gds', or an expression that references a global variable.
// In this case, we assume the string is an expression, and if we need to
// interpret it as a token, then we treat the symbol name as the token.
- return isa<MCSymbolRefExpr>(Expr);
+ return isSymbolRefExpr();
+ }
+
+ bool isSymbolRefExpr() const {
+ return isExpr() && Expr && isa<MCSymbolRefExpr>(Expr);
}
bool isImm() const override {
@@ -274,8 +276,10 @@ public:
isRegClass(AMDGPU::VReg_64RegClassID) ||
isRegClass(AMDGPU::VReg_96RegClassID) ||
isRegClass(AMDGPU::VReg_128RegClassID) ||
+ isRegClass(AMDGPU::VReg_160RegClassID) ||
isRegClass(AMDGPU::VReg_256RegClassID) ||
- isRegClass(AMDGPU::VReg_512RegClassID);
+ isRegClass(AMDGPU::VReg_512RegClassID) ||
+ isRegClass(AMDGPU::VReg_1024RegClassID);
}
bool isVReg32() const {
@@ -286,6 +290,10 @@ public:
return isOff() || isVReg32();
}
+ bool isNull() const {
+ return isRegKind() && getReg() == AMDGPU::SGPR_NULL;
+ }
+
bool isSDWAOperand(MVT type) const;
bool isSDWAFP16Operand() const;
bool isSDWAFP32Operand() const;
@@ -325,6 +333,7 @@ public:
bool isDLC() const { return isImmTy(ImmTyDLC); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
+ bool isSWZ() const { return isImmTy(ImmTySWZ); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isD16() const { return isImmTy(ImmTyD16); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); }
@@ -817,6 +826,7 @@ public:
case ImmTyDLC: OS << "DLC"; break;
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
+ case ImmTySWZ: OS << "SWZ"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
@@ -886,7 +896,7 @@ public:
int64_t Val, SMLoc Loc,
ImmTy Type = ImmTyNone,
bool IsFPImm = false) {
- auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser);
+ auto Op = std::make_unique<AMDGPUOperand>(Immediate, AsmParser);
Op->Imm.Val = Val;
Op->Imm.IsFPImm = IsFPImm;
Op->Imm.Type = Type;
@@ -899,7 +909,7 @@ public:
static AMDGPUOperand::Ptr CreateToken(const AMDGPUAsmParser *AsmParser,
StringRef Str, SMLoc Loc,
bool HasExplicitEncodingSize = true) {
- auto Res = llvm::make_unique<AMDGPUOperand>(Token, AsmParser);
+ auto Res = std::make_unique<AMDGPUOperand>(Token, AsmParser);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
Res->StartLoc = Loc;
@@ -910,7 +920,7 @@ public:
static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser,
unsigned RegNo, SMLoc S,
SMLoc E) {
- auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser);
+ auto Op = std::make_unique<AMDGPUOperand>(Register, AsmParser);
Op->Reg.RegNo = RegNo;
Op->Reg.Mods = Modifiers();
Op->StartLoc = S;
@@ -920,7 +930,7 @@ public:
static AMDGPUOperand::Ptr CreateExpr(const AMDGPUAsmParser *AsmParser,
const class MCExpr *Expr, SMLoc S) {
- auto Op = llvm::make_unique<AMDGPUOperand>(Expression, AsmParser);
+ auto Op = std::make_unique<AMDGPUOperand>(Expression, AsmParser);
Op->Expr = Expr;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -1051,11 +1061,23 @@ private:
std::string &CollectString);
bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
- RegisterKind RegKind, unsigned Reg1,
- unsigned RegNum);
+ RegisterKind RegKind, unsigned Reg1);
bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
- unsigned& RegNum, unsigned& RegWidth,
- unsigned *DwordRegIndex);
+ unsigned& RegNum, unsigned& RegWidth);
+ unsigned ParseRegularReg(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth);
+ unsigned ParseSpecialReg(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth);
+ unsigned ParseRegList(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth);
+ bool ParseRegRange(unsigned& Num, unsigned& Width);
+ unsigned getRegularReg(RegisterKind RegKind,
+ unsigned RegNum,
+ unsigned RegWidth);
+
bool isRegister();
bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const;
Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind);
@@ -1306,6 +1328,7 @@ private:
bool validateOpSel(const MCInst &Inst);
bool validateVccOperand(unsigned Reg) const;
bool validateVOP3Literal(const MCInst &Inst) const;
+ unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
@@ -1321,6 +1344,7 @@ private:
void peekTokens(MutableArrayRef<AsmToken> Tokens);
AsmToken::TokenKind getTokenKind() const;
bool parseExpr(int64_t &Imm);
+ bool parseExpr(OperandVector &Operands);
StringRef getTokenStr() const;
AsmToken peekToken();
AsmToken getToken() const;
@@ -1399,9 +1423,12 @@ public:
void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
- uint64_t BasicInstType, bool skipVcc = false);
+ uint64_t BasicInstType,
+ bool SkipDstVcc = false,
+ bool SkipSrcVcc = false);
AMDGPUOperand::Ptr defaultBLGP() const;
AMDGPUOperand::Ptr defaultCBSZ() const;
@@ -1636,8 +1663,8 @@ bool AMDGPUOperand::isSDWAInt32Operand() const {
}
bool AMDGPUOperand::isBoolReg() const {
- return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
- isSCSrcB64() : isSCSrcB32();
+ return (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) ||
+ (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32());
}
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
@@ -1849,6 +1876,8 @@ static bool isInlineValue(unsigned Reg) {
case AMDGPU::SRC_EXECZ:
case AMDGPU::SRC_SCC:
return true;
+ case AMDGPU::SGPR_NULL:
+ return true;
default:
return false;
}
@@ -1870,8 +1899,10 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 2: return AMDGPU::VReg_64RegClassID;
case 3: return AMDGPU::VReg_96RegClassID;
case 4: return AMDGPU::VReg_128RegClassID;
+ case 5: return AMDGPU::VReg_160RegClassID;
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
+ case 32: return AMDGPU::VReg_1024RegClassID;
}
} else if (Is == IS_TTMP) {
switch (RegWidth) {
@@ -1944,7 +1975,7 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("tba_lo", AMDGPU::TBA_LO)
.Case("tba_hi", AMDGPU::TBA_HI)
.Case("null", AMDGPU::SGPR_NULL)
- .Default(0);
+ .Default(AMDGPU::NoRegister);
}
bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
@@ -1959,8 +1990,7 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
}
bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
- RegisterKind RegKind, unsigned Reg1,
- unsigned RegNum) {
+ RegisterKind RegKind, unsigned Reg1) {
switch (RegKind) {
case IS_SPECIAL:
if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) {
@@ -2008,14 +2038,37 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
}
}
-static const StringRef Registers[] = {
- { "v" },
- { "s" },
- { "ttmp" },
- { "acc" },
- { "a" },
+struct RegInfo {
+ StringLiteral Name;
+ RegisterKind Kind;
+};
+
+static constexpr RegInfo RegularRegisters[] = {
+ {{"v"}, IS_VGPR},
+ {{"s"}, IS_SGPR},
+ {{"ttmp"}, IS_TTMP},
+ {{"acc"}, IS_AGPR},
+ {{"a"}, IS_AGPR},
};
+static bool isRegularReg(RegisterKind Kind) {
+ return Kind == IS_VGPR ||
+ Kind == IS_SGPR ||
+ Kind == IS_TTMP ||
+ Kind == IS_AGPR;
+}
+
+static const RegInfo* getRegularRegInfo(StringRef Str) {
+ for (const RegInfo &Reg : RegularRegisters)
+ if (Str.startswith(Reg.Name))
+ return &Reg;
+ return nullptr;
+}
+
+static bool getRegNum(StringRef Str, unsigned& Num) {
+ return !Str.getAsInteger(10, Num);
+}
+
bool
AMDGPUAsmParser::isRegister(const AsmToken &Token,
const AsmToken &NextToken) const {
@@ -2029,24 +2082,24 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token,
// A single register like s0 or a range of registers like s[0:1]
- StringRef RegName = Token.getString();
-
- for (StringRef Reg : Registers) {
- if (RegName.startswith(Reg)) {
- if (Reg.size() < RegName.size()) {
- unsigned RegNum;
- // A single register with an index: rXX
- if (!RegName.substr(Reg.size()).getAsInteger(10, RegNum))
- return true;
- } else {
- // A range of registers: r[XX:YY].
- if (NextToken.is(AsmToken::LBrac))
- return true;
- }
+ StringRef Str = Token.getString();
+ const RegInfo *Reg = getRegularRegInfo(Str);
+ if (Reg) {
+ StringRef RegName = Reg->Name;
+ StringRef RegSuffix = Str.substr(RegName.size());
+ if (!RegSuffix.empty()) {
+ unsigned Num;
+ // A single register with an index: rXX
+ if (getRegNum(RegSuffix, Num))
+ return true;
+ } else {
+ // A range of registers: r[XX:YY].
+ if (NextToken.is(AsmToken::LBrac))
+ return true;
}
}
- return getSpecialRegForName(RegName);
+ return getSpecialRegForName(Str) != AMDGPU::NoRegister;
}
bool
@@ -2055,137 +2108,161 @@ AMDGPUAsmParser::isRegister()
return isRegister(getToken(), peekToken());
}
-bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
- unsigned &RegNum, unsigned &RegWidth,
- unsigned *DwordRegIndex) {
- if (DwordRegIndex) { *DwordRegIndex = 0; }
+unsigned
+AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
+ unsigned RegNum,
+ unsigned RegWidth) {
+
+ assert(isRegularReg(RegKind));
+
+ unsigned AlignSize = 1;
+ if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
+ // SGPR and TTMP registers must be aligned.
+ // Max required alignment is 4 dwords.
+ AlignSize = std::min(RegWidth, 4u);
+ }
+
+ if (RegNum % AlignSize != 0)
+ return AMDGPU::NoRegister;
+
+ unsigned RegIdx = RegNum / AlignSize;
+ int RCID = getRegClass(RegKind, RegWidth);
+ if (RCID == -1)
+ return AMDGPU::NoRegister;
+
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- if (getLexer().is(AsmToken::Identifier)) {
- StringRef RegName = Parser.getTok().getString();
- if ((Reg = getSpecialRegForName(RegName))) {
- Parser.Lex();
- RegKind = IS_SPECIAL;
- } else {
- unsigned RegNumIndex = 0;
- if (RegName[0] == 'v') {
- RegNumIndex = 1;
- RegKind = IS_VGPR;
- } else if (RegName[0] == 's') {
- RegNumIndex = 1;
- RegKind = IS_SGPR;
- } else if (RegName[0] == 'a') {
- RegNumIndex = RegName.startswith("acc") ? 3 : 1;
- RegKind = IS_AGPR;
- } else if (RegName.startswith("ttmp")) {
- RegNumIndex = strlen("ttmp");
- RegKind = IS_TTMP;
- } else {
- return false;
- }
- if (RegName.size() > RegNumIndex) {
- // Single 32-bit register: vXX.
- if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum))
- return false;
- Parser.Lex();
- RegWidth = 1;
- } else {
- // Range of registers: v[XX:YY]. ":YY" is optional.
- Parser.Lex();
- int64_t RegLo, RegHi;
- if (getLexer().isNot(AsmToken::LBrac))
- return false;
- Parser.Lex();
+ const MCRegisterClass RC = TRI->getRegClass(RCID);
+ if (RegIdx >= RC.getNumRegs())
+ return AMDGPU::NoRegister;
- if (getParser().parseAbsoluteExpression(RegLo))
- return false;
+ return RC.getRegister(RegIdx);
+}
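A standalone sketch of the alignment rule enforced by getRegularReg above: SGPR and TTMP tuples must start on a boundary of min(width, 4) dwords, and the index within the register class is the dword number divided by that alignment. The return value -1 stands in for AMDGPU::NoRegister:

#include <algorithm>
#include <cstdio>

static int regularRegIndex(unsigned RegNum, unsigned RegWidth, bool NeedsAlign) {
  unsigned AlignSize = NeedsAlign ? std::min(RegWidth, 4u) : 1u;
  if (RegNum % AlignSize != 0)
    return -1; // misaligned tuple, e.g. s[3:4]
  return static_cast<int>(RegNum / AlignSize);
}

int main() {
  std::printf("%d\n", regularRegIndex(4, 2, true));  // s[4:5]  -> index 2
  std::printf("%d\n", regularRegIndex(3, 2, true));  // s[3:4]  -> rejected
  std::printf("%d\n", regularRegIndex(3, 2, false)); // v[3:4]  -> index 3
}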
- const bool isRBrace = getLexer().is(AsmToken::RBrac);
- if (!isRBrace && getLexer().isNot(AsmToken::Colon))
- return false;
- Parser.Lex();
+bool
+AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) {
+ int64_t RegLo, RegHi;
+ if (!trySkipToken(AsmToken::LBrac))
+ return false;
- if (isRBrace) {
- RegHi = RegLo;
- } else {
- if (getParser().parseAbsoluteExpression(RegHi))
- return false;
+ if (!parseExpr(RegLo))
+ return false;
- if (getLexer().isNot(AsmToken::RBrac))
- return false;
- Parser.Lex();
- }
- RegNum = (unsigned) RegLo;
- RegWidth = (RegHi - RegLo) + 1;
- }
- }
- } else if (getLexer().is(AsmToken::LBrac)) {
- // List of consecutive registers: [s0,s1,s2,s3]
- Parser.Lex();
- if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, nullptr))
- return false;
- if (RegWidth != 1)
+ if (trySkipToken(AsmToken::Colon)) {
+ if (!parseExpr(RegHi))
return false;
- RegisterKind RegKind1;
- unsigned Reg1, RegNum1, RegWidth1;
- do {
- if (getLexer().is(AsmToken::Comma)) {
- Parser.Lex();
- } else if (getLexer().is(AsmToken::RBrac)) {
- Parser.Lex();
- break;
- } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1, nullptr)) {
- if (RegWidth1 != 1) {
- return false;
- }
- if (RegKind1 != RegKind) {
- return false;
- }
- if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) {
- return false;
- }
- } else {
- return false;
- }
- } while (true);
} else {
- return false;
+ RegHi = RegLo;
}
- switch (RegKind) {
- case IS_SPECIAL:
+
+ if (!trySkipToken(AsmToken::RBrac))
+ return false;
+
+ if (!isUInt<32>(RegLo) || !isUInt<32>(RegHi) || RegLo > RegHi)
+ return false;
+
+ Num = static_cast<unsigned>(RegLo);
+ Width = (RegHi - RegLo) + 1;
+ return true;
+}
+
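A standalone sketch of the bracketed range syntax handled by ParseRegRange above, where "[N]" yields width 1 and "[LO:HI]" yields HI - LO + 1 with descending ranges rejected; plain string scanning stands in for the AsmParser token stream and the error handling is deliberately simplistic:

#include <cstdio>

static bool parseRegRange(const char *Str, unsigned &Num, unsigned &Width) {
  unsigned Lo, Hi;
  if (std::sscanf(Str, "[%u:%u]", &Lo, &Hi) == 2) {
    if (Lo > Hi)
      return false;
    Num = Lo;
    Width = Hi - Lo + 1;
    return true;
  }
  if (std::sscanf(Str, "[%u]", &Lo) == 1) {
    Num = Lo;
    Width = 1;
    return true;
  }
  return false;
}

int main() {
  unsigned Num, Width;
  if (parseRegRange("[4:7]", Num, Width))
    std::printf("first dword %u, width %u\n", Num, Width); // 4, 4
  std::printf("descending accepted? %d\n", parseRegRange("[7:4]", Num, Width));
}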
+unsigned
+AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth) {
+ assert(isToken(AsmToken::Identifier));
+ unsigned Reg = getSpecialRegForName(getTokenStr());
+ if (Reg) {
RegNum = 0;
RegWidth = 1;
- break;
- case IS_VGPR:
- case IS_SGPR:
- case IS_AGPR:
- case IS_TTMP:
- {
- unsigned Size = 1;
- if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
- // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords.
- Size = std::min(RegWidth, 4u);
- }
- if (RegNum % Size != 0)
- return false;
- if (DwordRegIndex) { *DwordRegIndex = RegNum; }
- RegNum = RegNum / Size;
- int RCID = getRegClass(RegKind, RegWidth);
- if (RCID == -1)
- return false;
- const MCRegisterClass RC = TRI->getRegClass(RCID);
- if (RegNum >= RC.getNumRegs())
- return false;
- Reg = RC.getRegister(RegNum);
- break;
+ RegKind = IS_SPECIAL;
+ lex(); // skip register name
+ }
+ return Reg;
+}
+
+unsigned
+AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth) {
+ assert(isToken(AsmToken::Identifier));
+ StringRef RegName = getTokenStr();
+
+ const RegInfo *RI = getRegularRegInfo(RegName);
+ if (!RI)
+ return AMDGPU::NoRegister;
+ lex(); // skip register name
+
+ RegKind = RI->Kind;
+ StringRef RegSuffix = RegName.substr(RI->Name.size());
+ if (!RegSuffix.empty()) {
+ // Single 32-bit register: vXX.
+ if (!getRegNum(RegSuffix, RegNum))
+ return AMDGPU::NoRegister;
+ RegWidth = 1;
+ } else {
+ // Range of registers: v[XX:YY]. ":YY" is optional.
+ if (!ParseRegRange(RegNum, RegWidth))
+ return AMDGPU::NoRegister;
}
- default:
- llvm_unreachable("unexpected register kind");
+ return getRegularReg(RegKind, RegNum, RegWidth);
+}
+
+unsigned
+AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth) {
+ unsigned Reg = AMDGPU::NoRegister;
+
+ if (!trySkipToken(AsmToken::LBrac))
+ return AMDGPU::NoRegister;
+
+ // List of consecutive registers, e.g.: [s0,s1,s2,s3]
+
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth))
+ return AMDGPU::NoRegister;
+ if (RegWidth != 1)
+ return AMDGPU::NoRegister;
+
+ for (; trySkipToken(AsmToken::Comma); ) {
+ RegisterKind NextRegKind;
+ unsigned NextReg, NextRegNum, NextRegWidth;
+
+ if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth))
+ return AMDGPU::NoRegister;
+ if (NextRegWidth != 1)
+ return AMDGPU::NoRegister;
+ if (NextRegKind != RegKind)
+ return AMDGPU::NoRegister;
+ if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg))
+ return AMDGPU::NoRegister;
}
- if (!subtargetHasRegister(*TRI, Reg))
- return false;
- return true;
+ if (!trySkipToken(AsmToken::RBrac))
+ return AMDGPU::NoRegister;
+
+ if (isRegularReg(RegKind))
+ Reg = getRegularReg(RegKind, RegNum, RegWidth);
+
+ return Reg;
+}
+
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind,
+ unsigned &Reg,
+ unsigned &RegNum,
+ unsigned &RegWidth) {
+ Reg = AMDGPU::NoRegister;
+
+ if (isToken(AsmToken::Identifier)) {
+ Reg = ParseSpecialReg(RegKind, RegNum, RegWidth);
+ if (Reg == AMDGPU::NoRegister)
+ Reg = ParseRegularReg(RegKind, RegNum, RegWidth);
+ } else {
+ Reg = ParseRegList(RegKind, RegNum, RegWidth);
+ }
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ return Reg != AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg);
}
Optional<StringRef>
@@ -2241,18 +2318,18 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
SMLoc StartLoc = Tok.getLoc();
SMLoc EndLoc = Tok.getEndLoc();
RegisterKind RegKind;
- unsigned Reg, RegNum, RegWidth, DwordRegIndex;
+ unsigned Reg, RegNum, RegWidth;
- if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) {
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
// FIXME: improve error messages (bug 41303).
Error(StartLoc, "not a valid operand.");
return nullptr;
}
if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
- if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth))
+ if (!updateGprCountSymbols(RegKind, RegNum, RegWidth))
return nullptr;
} else
- KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
+ KernelScope.usesRegister(RegKind, RegNum, RegWidth);
return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc);
}
@@ -2648,7 +2725,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
case AMDGPU::M0:
- case AMDGPU::SGPR_NULL:
return Reg;
default:
break;
@@ -2697,13 +2773,38 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
}
}
+unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const {
+ if (!isGFX10())
+ return 1;
+
+ switch (Opcode) {
+ // 64-bit shift instructions can use only one scalar value input
+ case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_gfx10:
+ case AMDGPU::V_LSHL_B64:
+ case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHRREV_B64_gfx10:
+ case AMDGPU::V_LSHR_B64:
+ case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_ASHRREV_I64_gfx10:
+ case AMDGPU::V_ASHR_I64:
+ return 1;
+ default:
+ return 2;
+ }
+}
+
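A standalone sketch of the per-opcode limit introduced by getConstantBusLimit above: GFX10 generally allows two constant-bus reads (SGPRs or a literal) per VALU instruction, except for the 64-bit shifts, which keep the single slot, while pre-GFX10 targets always have one. The explicit opcode switch is reduced to a boolean here:

#include <cstdio>

static unsigned constantBusLimit(bool IsGFX10, bool Is64BitShift) {
  if (!IsGFX10)
    return 1;                   // pre-GFX10: a single constant bus slot
  return Is64BitShift ? 1 : 2;  // GFX10: two slots, except 64-bit shifts
}

static bool validate(unsigned ConstantBusUseCount, bool IsGFX10,
                     bool Is64BitShift) {
  return ConstantBusUseCount <= constantBusLimit(IsGFX10, Is64BitShift);
}

int main() {
  std::printf("%d\n", validate(2, true, false));  // two SGPR sources: OK on GFX10
  std::printf("%d\n", validate(2, true, true));   // but not for v_lshlrev_b64
  std::printf("%d\n", validate(2, false, false)); // nor on earlier targets
}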
bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
const MCOperand &MO = Inst.getOperand(OpIdx);
if (MO.isImm()) {
return !isInlineConstant(Inst, OpIdx);
+ } else if (MO.isReg()) {
+ auto Reg = MO.getReg();
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ return isSGPR(mc2PseudoReg(Reg), TRI) && Reg != SGPR_NULL;
+ } else {
+ return true;
}
- return !MO.isReg() ||
- isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo());
}
bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
@@ -2782,10 +2883,7 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
}
ConstantBusUseCount += NumLiterals;
- if (isGFX10())
- return ConstantBusUseCount <= 2;
-
- return ConstantBusUseCount <= 1;
+ return ConstantBusUseCount <= getConstantBusLimit(Opcode);
}
bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) {
@@ -3212,6 +3310,7 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const {
const int OpIndices[] = { Src0Idx, Src1Idx };
+ unsigned NumExprs = 0;
unsigned NumLiterals = 0;
uint32_t LiteralValue;
@@ -3219,19 +3318,21 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const {
if (OpIdx == -1) break;
const MCOperand &MO = Inst.getOperand(OpIdx);
- if (MO.isImm() &&
- // Exclude special imm operands (like that used by s_set_gpr_idx_on)
- AMDGPU::isSISrcOperand(Desc, OpIdx) &&
- !isInlineConstant(Inst, OpIdx)) {
- uint32_t Value = static_cast<uint32_t>(MO.getImm());
- if (NumLiterals == 0 || LiteralValue != Value) {
- LiteralValue = Value;
- ++NumLiterals;
+ // Exclude special imm operands (like the one used by s_set_gpr_idx_on)
+ if (AMDGPU::isSISrcOperand(Desc, OpIdx)) {
+ if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
+ uint32_t Value = static_cast<uint32_t>(MO.getImm());
+ if (NumLiterals == 0 || LiteralValue != Value) {
+ LiteralValue = Value;
+ ++NumLiterals;
+ }
+ } else if (MO.isExpr()) {
+ ++NumExprs;
}
}
}
- return NumLiterals <= 1;
+ return NumLiterals + NumExprs <= 1;
}
bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
@@ -3267,6 +3368,7 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+ unsigned NumExprs = 0;
unsigned NumLiterals = 0;
uint32_t LiteralValue;
@@ -3274,17 +3376,26 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
if (OpIdx == -1) break;
const MCOperand &MO = Inst.getOperand(OpIdx);
- if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx))
+ if (!MO.isImm() && !MO.isExpr())
+ continue;
+ if (!AMDGPU::isSISrcOperand(Desc, OpIdx))
continue;
- if (!isInlineConstant(Inst, OpIdx)) {
+ if (OpIdx == Src2Idx && (Desc.TSFlags & SIInstrFlags::IsMAI) &&
+ getFeatureBits()[AMDGPU::FeatureMFMAInlineLiteralBug])
+ return false;
+
+ if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
uint32_t Value = static_cast<uint32_t>(MO.getImm());
if (NumLiterals == 0 || LiteralValue != Value) {
LiteralValue = Value;
++NumLiterals;
}
+ } else if (MO.isExpr()) {
+ ++NumExprs;
}
}
+ NumLiterals += NumExprs;
return !NumLiterals ||
(NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]);
@@ -3607,37 +3718,44 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange);
- UserSGPRCount += 4;
+ if (Val)
+ UserSGPRCount += 4;
} else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
Val, ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
Val, ValRange);
- UserSGPRCount += 1;
+ if (Val)
+ UserSGPRCount += 1;
} else if (ID == ".amdhsa_wavefront_size32") {
if (IVersion.Major < 10)
return getParser().Error(IDRange.Start, "directive requires gfx10+",
@@ -5225,6 +5343,23 @@ AMDGPUAsmParser::parseExpr(int64_t &Imm) {
}
bool
+AMDGPUAsmParser::parseExpr(OperandVector &Operands) {
+ SMLoc S = getLoc();
+
+ const MCExpr *Expr;
+ if (Parser.parseExpression(Expr))
+ return false;
+
+ int64_t IntVal;
+ if (Expr->evaluateAsAbsolute(IntVal)) {
+ Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S));
+ } else {
+ Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
+ }
+ return true;
+}
+
+bool
AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) {
if (isToken(AsmToken::String)) {
Val = getToken().getStringContents();
@@ -5605,25 +5740,29 @@ bool AMDGPUOperand::isGPRIdxMode() const {
OperandMatchResultTy
AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
- switch (getLexer().getKind()) {
- default: return MatchOperand_ParseFail;
- case AsmToken::Integer: {
- int64_t Imm;
- if (getParser().parseAbsoluteExpression(Imm))
- return MatchOperand_ParseFail;
- Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S));
- return MatchOperand_Success;
- }
+ // Make sure we are not parsing something
+ // that looks like a label or an expression but is not.
+ // This will improve error messages.
+ if (isRegister() || isModifier())
+ return MatchOperand_NoMatch;
- case AsmToken::Identifier:
- Operands.push_back(AMDGPUOperand::CreateExpr(this,
- MCSymbolRefExpr::create(getContext().getOrCreateSymbol(
- Parser.getTok().getString()), getContext()), S));
- Parser.Lex();
- return MatchOperand_Success;
+ if (parseExpr(Operands)) {
+
+ AMDGPUOperand &Opr = ((AMDGPUOperand &)*Operands[Operands.size() - 1]);
+ assert(Opr.isImm() || Opr.isExpr());
+ SMLoc Loc = Opr.getStartLoc();
+
+ // Currently we do not support arbitrary expressions as branch targets.
+ // Only labels and absolute expressions are accepted.
+ if (Opr.isExpr() && !Opr.isSymbolRefExpr()) {
+ Error(Loc, "expected an absolute expression or a label");
+ } else if (Opr.isImm() && !Opr.isS16Imm()) {
+ Error(Loc, "expected a 16-bit signed jump offset");
+ }
}
+
+ return MatchOperand_Success; // avoid excessive error messages
}
//===----------------------------------------------------------------------===//
@@ -5908,6 +6047,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
+ {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"high", AMDGPUOperand::ImmTyHigh, true, nullptr},
@@ -5941,8 +6081,6 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
};
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
- unsigned size = Operands.size();
- assert(size > 0);
OperandMatchResultTy res = parseOptionalOpr(Operands);
@@ -5957,17 +6095,13 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan
// to make sure autogenerated parser of custom operands never hit hardcoded
// mandatory operands.
- if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) {
-
- // We have parsed the first optional operand.
- // Parse as many operands as necessary to skip all mandatory operands.
+ for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) {
+ if (res != MatchOperand_Success ||
+ isToken(AsmToken::EndOfStatement))
+ break;
- for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) {
- if (res != MatchOperand_Success ||
- getLexer().is(AsmToken::EndOfStatement)) break;
- if (getLexer().is(AsmToken::Comma)) Parser.Lex();
- res = parseOptionalOpr(Operands);
- }
+ trySkipToken(AsmToken::Comma);
+ res = parseOptionalOpr(Operands);
}
return res;
@@ -6682,7 +6816,11 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
}
void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) {
- cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true);
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true, true);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, false, true);
}
void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
@@ -6690,11 +6828,14 @@ void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
}
void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
- uint64_t BasicInstType, bool skipVcc) {
+ uint64_t BasicInstType,
+ bool SkipDstVcc,
+ bool SkipSrcVcc) {
using namespace llvm::AMDGPU::SDWA;
OptionalImmIndexMap OptionalIdx;
- bool skippedVcc = false;
+ bool SkipVcc = SkipDstVcc || SkipSrcVcc;
+ bool SkippedVcc = false;
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
@@ -6704,19 +6845,21 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- if (skipVcc && !skippedVcc && Op.isReg() &&
+ if (SkipVcc && !SkippedVcc && Op.isReg() &&
(Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) {
// VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
// Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3)
// or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand.
// Skip VCC only if we didn't skip it on previous iteration.
+ // Note that src0 and src1 occupy 2 slots each because of modifiers.
if (BasicInstType == SIInstrFlags::VOP2 &&
- (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) {
- skippedVcc = true;
+ ((SkipDstVcc && Inst.getNumOperands() == 1) ||
+ (SkipSrcVcc && Inst.getNumOperands() == 5))) {
+ SkippedVcc = true;
continue;
} else if (BasicInstType == SIInstrFlags::VOPC &&
Inst.getNumOperands() == 0) {
- skippedVcc = true;
+ SkippedVcc = true;
continue;
}
}
@@ -6728,7 +6871,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
} else {
llvm_unreachable("Invalid operand type");
}
- skippedVcc = false;
+ SkippedVcc = false;
}
if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx10 &&
@@ -6849,6 +6992,14 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand;
case MCK_AttrChan:
return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand;
+ case MCK_SReg_64:
+ case MCK_SReg_64_XEXEC:
+ // Null is defined as a 32-bit register but
+ // it should also be enabled with 64-bit operands.
+ // The following code enables it for SReg_64 operands
+ // used as source and destination. Remaining source
+ // operands are handled in isInlinableImm.
+ return Operand.isNull() ? Match_Success : Match_InvalidOperand;
default:
return Match_InvalidOperand;
}
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 62a19d848af2..1b12550aed88 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -7,13 +7,13 @@
//===----------------------------------------------------------------------===//
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 8, "SelectMUBUFAddr64">;
+def MUBUFAddr64 : ComplexPattern<i64, 9, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
-def MUBUFOffset : ComplexPattern<i64, 7, "SelectMUBUFOffset">;
+def MUBUFOffset : ComplexPattern<i64, 8, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
@@ -54,6 +54,17 @@ class MTBUFAddr64Table <bit is_addr64, string Name> {
// MTBUF classes
//===----------------------------------------------------------------------===//
+class MTBUFGetBaseOpcode<string Op> {
+ string ret = !subst("FORMAT_XY", "FORMAT_X",
+ !subst("FORMAT_XYZ", "FORMAT_X",
+ !subst("FORMAT_XYZW", "FORMAT_X", Op)));
+}
+
+class getMTBUFElements<string Op> {
+ int ret = 1;
+}
+
+
class MTBUF_Pseudo <string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
InstSI<outs, ins, "", pattern>,
@@ -67,6 +78,9 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
string Mnemonic = opName;
string AsmOperands = asmOps;
+ Instruction Opcode = !cast<Instruction>(NAME);
+ Instruction BaseOpcode = !cast<Instruction>(MTBUFGetBaseOpcode<NAME>.ret);
+
let VM_CNT = 1;
let EXP_CNT = 1;
let MTBUF = 1;
@@ -90,6 +104,7 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_offset = 1;
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
+ bits<4> elements = 0;
}
class MTBUF_Real <MTBUF_Pseudo ps> :
@@ -126,17 +141,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc),
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc)
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe, DLC:$dlc),
+ SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe, DLC:$dlc)
+ SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
@@ -181,51 +196,54 @@ class MTBUF_SetupAddr<int addrKind> {
class MTBUF_Load_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
+ int elems,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MTBUF_Pseudo<opName,
(outs vdataClass:$vdata),
getMTBUFIns<addrKindCopy>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 1;
let mayStore = 0;
+ let elements = elems;
}
multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
- ValueType load_vt = i32,
+ int elems, ValueType load_vt = i32,
SDPatternOperator ld = null_frag> {
- def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format,
- i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>,
+ i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>,
MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
- i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>,
+ i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>,
MTBUFAddr64Table<1, NAME>;
- def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
+ def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
+ def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
- def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>;
+ def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
+ def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
+ def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
}
}
class MTBUF_Store_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
+ int elems,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
@@ -233,39 +251,40 @@ class MTBUF_Store_Pseudo <string opName,
: MTBUF_Pseudo<opName,
(outs),
getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 0;
let mayStore = 1;
+ let elements = elems;
}
multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
- ValueType store_vt = i32,
+ int elems, ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe, i1:$dlc))]>,
+ i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe, i1:$dlc))]>,
+ i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MTBUFAddr64Table<1, NAME>;
- def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
+ def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
+ def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
- def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>;
+ def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
+ def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
+ def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
}
}
@@ -320,7 +339,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_offset = 1;
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
- bits<4> dwords = 0;
+ bits<4> elements = 0;
}
class MUBUF_Real <MUBUF_Pseudo ps> :
@@ -393,18 +412,30 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
);
dag ret = !con(
!if(!empty(vdataList), InsNoData, InsData),
- !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc))
+    !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc, SWZ:$swz))
);
}
-class getMUBUFDwords<RegisterClass regClass> {
- string regClassAsInt = !cast<string>(regClass);
+class getMUBUFElements<ValueType vt> {
+ // eq does not support ValueType for some reason.
+ string vtAsStr = !cast<string>(vt);
+
int ret =
- !if(!eq(regClassAsInt, !cast<string>(VGPR_32)), 1,
- !if(!eq(regClassAsInt, !cast<string>(VReg_64)), 2,
- !if(!eq(regClassAsInt, !cast<string>(VReg_96)), 3,
- !if(!eq(regClassAsInt, !cast<string>(VReg_128)), 4,
- 0))));
+ !if(!eq(vtAsStr, "f16"), 1,
+ !if(!eq(vtAsStr, "v2f16"), 2,
+ !if(!eq(vtAsStr, "v3f16"), 3,
+ !if(!eq(vtAsStr, "v4f16"), 4,
+ !if(!eq(vt.Size, 32), 1,
+ !if(!eq(vt.Size, 64), 2,
+ !if(!eq(vt.Size, 96), 3,
+ !if(!eq(vt.Size, 128), 4, 0)
+ )
+ )
+ )
+ )
+ )
+ )
+ );
}
class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> {
@@ -442,18 +473,18 @@ class MUBUF_SetupAddr<int addrKind> {
class MUBUF_Load_Pseudo <string opName,
int addrKind,
- RegisterClass vdataClass,
+ ValueType vdata_vt,
bit HasTiedDest = 0,
bit isLds = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
- (outs vdataClass:$vdata),
+ (outs getVregSrcForVT<vdata_vt>.ret:$vdata),
!con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
- !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
+ !if(HasTiedDest, (ins getVregSrcForVT<vdata_vt>.ret:$vdata_in), (ins))),
" $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
- !if(isLds, " lds", "$tfe") # "$dlc",
+ !if(isLds, " lds", "$tfe") # "$dlc" # "$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # !if(isLds, "_lds", "") #
@@ -467,19 +498,19 @@ class MUBUF_Load_Pseudo <string opName,
let Uses = !if(isLds, [EXEC, M0], [EXEC]);
let has_tfe = !if(isLds, 0, 1);
let lds = isLds;
- let dwords = getMUBUFDwords<vdataClass>.ret;
+ let elements = getMUBUFElements<vdata_vt>.ret;
}
class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
- (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))
+ (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+ (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))
>;
class MUBUF_Addr64_Load_Pat <Instruction inst,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
- (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))
+ (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+ (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))
>;
multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> {
@@ -490,89 +521,87 @@ multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPa
// FIXME: tfe can't be an operand because it requires a separate
// opcode because it needs an N+1 register class dest register.
-multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+multiclass MUBUF_Pseudo_Loads<string opName,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag,
bit TiedDest = 0,
bit isLds = 0> {
- def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>,
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>,
MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
- def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, TiedDest, isLds>,
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds>,
MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;
}
}
-multiclass MUBUF_Pseudo_Loads_Lds<string opName, RegisterClass vdataClass,
- ValueType load_vt = i32,
+multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32,
SDPatternOperator ld_nolds = null_frag,
SDPatternOperator ld_lds = null_frag> {
- defm NAME : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_nolds>;
- defm _LDS : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_lds, 0, 1>;
+ defm NAME : MUBUF_Pseudo_Loads<opName, load_vt, ld_nolds>;
+ defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, ld_lds, 0, 1>;
}
class MUBUF_Store_Pseudo <string opName,
int addrKind,
- RegisterClass vdataClass,
+ ValueType store_vt,
list<dag> pattern=[],
// Workaround bug bz30254
- int addrKindCopy = addrKind,
- RegisterClass vdataClassCopy = vdataClass>
+ int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
(outs),
- getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
+ getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret]>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 0;
let mayStore = 1;
let maybeAtomic = 1;
- let dwords = getMUBUFDwords<vdataClass>.ret;
+ let elements = getMUBUFElements<store_vt>.ret;
}
-multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+multiclass MUBUF_Pseudo_Stores<string opName,
ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MUBUFAddr64Table<0, NAME>;
- def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MUBUFAddr64Table<1, NAME>;
- def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
+ def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
+ def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
- def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt>;
+ def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
+ def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
+ def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;
}
}
class MUBUF_Pseudo_Store_Lds<string opName>
: MUBUF_Pseudo<opName,
(outs),
- (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
- " $srsrc, $soffset$offset lds$glc$slc"> {
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz),
+ " $srsrc, $soffset$offset lds$glc$slc$swz"> {
let mayLoad = 0;
let mayStore = 1;
let maybeAtomic = 1;
@@ -686,7 +715,7 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
SDPatternOperator atomic,
- bit isFP = getIsFP<vdataType>.ret> {
+ bit isFP = isFloatType<vdataType>.ret> {
let FPAtomic = isFP in
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
MUBUFAddr64Table <0, NAME>;
@@ -710,7 +739,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
SDPatternOperator atomic,
- bit isFP = getIsFP<vdataType>.ret> {
+ bit isFP = isFloatType<vdataType>.ret> {
let FPAtomic = isFP in
def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set vdataType:$vdata,
@@ -748,107 +777,107 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
//===----------------------------------------------------------------------===//
defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_format_x", VGPR_32
+ "buffer_load_format_x", f32
>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads <
- "buffer_load_format_xy", VReg_64
+ "buffer_load_format_xy", v2f32
>;
defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Pseudo_Loads <
- "buffer_load_format_xyz", VReg_96
+ "buffer_load_format_xyz", v3f32
>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads <
- "buffer_load_format_xyzw", VReg_128
+ "buffer_load_format_xyzw", v4f32
>;
defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores <
- "buffer_store_format_x", VGPR_32
+ "buffer_store_format_x", f32
>;
defm BUFFER_STORE_FORMAT_XY : MUBUF_Pseudo_Stores <
- "buffer_store_format_xy", VReg_64
+ "buffer_store_format_xy", v2f32
>;
defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores <
- "buffer_store_format_xyz", VReg_96
+ "buffer_store_format_xyz", v3f32
>;
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
- "buffer_store_format_xyzw", VReg_128
+ "buffer_store_format_xyzw", v4f32
>;
let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_x", VGPR_32
+ "buffer_load_format_d16_x", i32
>;
defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xy", VReg_64
+ "buffer_load_format_d16_xy", v2i32
>;
defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xyz", VReg_96
+ "buffer_load_format_d16_xyz", v3i32
>;
defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xyzw", VReg_128
+ "buffer_load_format_d16_xyzw", v4i32
>;
defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_x", VGPR_32
+ "buffer_store_format_d16_x", i32
>;
defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xy", VReg_64
+ "buffer_store_format_d16_xy", v2i32
>;
defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xyz", VReg_96
+ "buffer_store_format_d16_xyz", v3i32
>;
defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xyzw", VReg_128
+ "buffer_store_format_d16_xyzw", v4i32
>;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_x", VGPR_32
+ "buffer_load_format_d16_x", f16
>;
defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xy", VGPR_32
+ "buffer_load_format_d16_xy", v2f16
>;
defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xyz", VReg_64
+ "buffer_load_format_d16_xyz", v3f16
>;
defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xyzw", VReg_64
+ "buffer_load_format_d16_xyzw", v4f16
>;
defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_x", VGPR_32
+ "buffer_store_format_d16_x", f16
>;
defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xy", VGPR_32
+ "buffer_store_format_d16_xy", v2f16
>;
defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xyz", VReg_64
+ "buffer_store_format_d16_xyz", v3f16
>;
defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xyzw", VReg_64
+ "buffer_store_format_d16_xyzw", v4f16
>;
} // End HasPackedD16VMem.
defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_ubyte", VGPR_32, i32
+ "buffer_load_ubyte", i32
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_sbyte", VGPR_32, i32
+ "buffer_load_sbyte", i32
>;
defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_ushort", VGPR_32, i32
+ "buffer_load_ushort", i32
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_sshort", VGPR_32, i32
+ "buffer_load_sshort", i32
>;
defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_dword", VGPR_32, i32
+ "buffer_load_dword", i32
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx2", VReg_64, v2i32
+ "buffer_load_dwordx2", v2i32
>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx3", VReg_96, v3i32
+ "buffer_load_dwordx3", v3i32
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx4", VReg_128, v4i32
+ "buffer_load_dwordx4", v4i32
>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
@@ -867,111 +896,111 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
// in at least GFX8+ chips. See Bug 37653.
let SubtargetPredicate = isGFX8GFX9 in {
defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1
+ "buffer_load_dwordx2", v2i32, null_frag, 0, 1
>;
defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1
+ "buffer_load_dwordx3", v3i32, null_frag, 0, 1
>;
defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1
+ "buffer_load_dwordx4", v4i32, null_frag, 0, 1
>;
}
defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
- "buffer_store_byte", VGPR_32, i32, truncstorei8_global
+ "buffer_store_byte", i32, truncstorei8_global
>;
defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
- "buffer_store_short", VGPR_32, i32, truncstorei16_global
+ "buffer_store_short", i32, truncstorei16_global
>;
defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
- "buffer_store_dword", VGPR_32, i32, store_global
+ "buffer_store_dword", i32, store_global
>;
defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx2", VReg_64, v2i32, store_global
+ "buffer_store_dwordx2", v2i32, store_global
>;
defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx3", VReg_96, v3i32, store_global
+ "buffer_store_dwordx3", v3i32, store_global
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx4", VReg_128, v4i32, store_global
+ "buffer_store_dwordx4", v4i32, store_global
>;
defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
+ "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global_32
>;
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics <
"buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
>;
defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
- "buffer_atomic_add", VGPR_32, i32, atomic_add_global
+ "buffer_atomic_add", VGPR_32, i32, atomic_load_add_global_32
>;
defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
+ "buffer_atomic_sub", VGPR_32, i32, atomic_load_sub_global_32
>;
defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
+ "buffer_atomic_smin", VGPR_32, i32, atomic_load_min_global_32
>;
defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
+ "buffer_atomic_umin", VGPR_32, i32, atomic_load_umin_global_32
>;
defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
+ "buffer_atomic_smax", VGPR_32, i32, atomic_load_max_global_32
>;
defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
+ "buffer_atomic_umax", VGPR_32, i32, atomic_load_umax_global_32
>;
defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics <
- "buffer_atomic_and", VGPR_32, i32, atomic_and_global
+ "buffer_atomic_and", VGPR_32, i32, atomic_load_and_global_32
>;
defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics <
- "buffer_atomic_or", VGPR_32, i32, atomic_or_global
+ "buffer_atomic_or", VGPR_32, i32, atomic_load_or_global_32
>;
defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics <
- "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+ "buffer_atomic_xor", VGPR_32, i32, atomic_load_xor_global_32
>;
defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics <
- "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
+ "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global_32
>;
defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics <
- "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
+ "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global_32
>;
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global
+ "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global_64
>;
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
>;
defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global
+ "buffer_atomic_add_x2", VReg_64, i64, atomic_load_add_global_64
>;
defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global
+ "buffer_atomic_sub_x2", VReg_64, i64, atomic_load_sub_global_64
>;
defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global
+ "buffer_atomic_smin_x2", VReg_64, i64, atomic_load_min_global_64
>;
defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global
+ "buffer_atomic_umin_x2", VReg_64, i64, atomic_load_umin_global_64
>;
defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global
+ "buffer_atomic_smax_x2", VReg_64, i64, atomic_load_max_global_64
>;
defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global
+ "buffer_atomic_umax_x2", VReg_64, i64, atomic_load_umax_global_64
>;
defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global
+ "buffer_atomic_and_x2", VReg_64, i64, atomic_load_and_global_64
>;
defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global
+ "buffer_atomic_or_x2", VReg_64, i64, atomic_load_or_global_64
>;
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global
+ "buffer_atomic_xor_x2", VReg_64, i64, atomic_load_xor_global_64
>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
+ "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global_64
>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
+ "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64
>;
let SubtargetPredicate = isGFX8GFX9 in {
@@ -981,58 +1010,75 @@ def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
let SubtargetPredicate = isGFX6 in { // isn't on CI & VI
/*
defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
-defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">;
-defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin">;
-defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax">;
defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub_x2">;
-defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap_x2">;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin_x2">;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax_x2">;
*/
def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
int_amdgcn_buffer_wbinvl1_sc>;
}
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+
+defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag
+>;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fmin", VGPR_32, f32, null_frag
+>;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fmax", VGPR_32, f32, null_frag
+>;
+defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
+>;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fmin_x2", VReg_64, f64, null_frag
+>;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fmax_x2", VReg_64, f64, null_frag
+>;
+
+}
+
let SubtargetPredicate = HasD16LoadStore in {
defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_ubyte_d16", VGPR_32, i32, null_frag, 1
+ "buffer_load_ubyte_d16", i32, null_frag, 1
>;
defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_ubyte_d16_hi", VGPR_32, i32, null_frag, 1
+ "buffer_load_ubyte_d16_hi", i32, null_frag, 1
>;
defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_sbyte_d16", VGPR_32, i32, null_frag, 1
+ "buffer_load_sbyte_d16", i32, null_frag, 1
>;
defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_sbyte_d16_hi", VGPR_32, i32, null_frag, 1
+ "buffer_load_sbyte_d16_hi", i32, null_frag, 1
>;
defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_short_d16", VGPR_32, i32, null_frag, 1
+ "buffer_load_short_d16", i32, null_frag, 1
>;
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_short_d16_hi", VGPR_32, i32, null_frag, 1
+ "buffer_load_short_d16_hi", i32, null_frag, 1
>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
- "buffer_store_byte_d16_hi", VGPR_32, i32
+ "buffer_store_byte_d16_hi", i32
>;
defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
- "buffer_store_short_d16_hi", VGPR_32, i32
+ "buffer_store_short_d16_hi", i32
>;
defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_hi_x", VGPR_32
+ "buffer_load_format_d16_hi_x", i32
>;
defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_hi_x", VGPR_32
+ "buffer_store_format_d16_hi_x", i32
>;
} // End HasD16LoadStore
@@ -1043,10 +1089,10 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
let SubtargetPredicate = HasAtomicFaddInsts in {
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global
+ "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+ "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
>;
} // End SubtargetPredicate = HasAtomicFaddInsts
@@ -1055,35 +1101,35 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
// MTBUF Instructions
//===----------------------------------------------------------------------===//
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
-defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64, 2>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>;
let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
- defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
- defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>;
- defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
- defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>;
- defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>;
- defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>;
+ defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>;
+ defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>;
+ defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
- defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
- defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>;
- defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
- defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>;
- defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>;
- defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>;
+ defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>;
+ defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>;
+ defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>;
} // End HasPackedD16VMem.
let SubtargetPredicate = isGFX7Plus in {
@@ -1118,6 +1164,10 @@ def extract_dlc : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
}]>;
+def extract_swz : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
+}]>;
+
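The extract_* transforms peel individual cache-policy bits out of the single immediate "auxiliary" operand carried by the buffer intrinsics: extract_dlc takes bit 2 and the newly added extract_swz takes bit 3. A minimal standalone C++ sketch of that decoding, not the in-tree helpers (glc and slc in bits 0 and 1 are assumed from the neighbouring extract_glc/extract_slc definitions):

#include <cstdint>
#include <cstdio>

// Per-instruction cache-policy bits, as split out by the SDNodeXForms above.
struct BufCachePolicy {
  bool GLC; // globally coherent      (assumed bit 0)
  bool SLC; // system-level coherent  (assumed bit 1)
  bool DLC; // device-level coherent, bit 2 per extract_dlc
  bool SWZ; // swizzled access,       bit 3 per extract_swz
};

static BufCachePolicy decodeAuxiliary(uint64_t Aux) {
  return {static_cast<bool>(Aux & 1), static_cast<bool>((Aux >> 1) & 1),
          static_cast<bool>((Aux >> 2) & 1), static_cast<bool>((Aux >> 3) & 1)};
}

int main() {
  BufCachePolicy P = decodeAuxiliary(0xC); // dlc and swz set
  std::printf("glc=%d slc=%d dlc=%d swz=%d\n", P.GLC, P.SLC, P.DLC, P.SWZ);
  return 0;
}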
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
@@ -1125,33 +1175,37 @@ def extract_dlc : SDNodeXForm<imm, [{
multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0)),
+ (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0)),
+ (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm)),
+ (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm)),
+ (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
}
@@ -1182,8 +1236,12 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
@@ -1196,36 +1254,40 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $cachepolicy),
- (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (as_i16imm $offset), (extract_glc $auxiliary),
+ (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $cachepolicy),
- (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (as_i16imm $offset), (extract_glc $auxiliary),
+ (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy),
- (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary),
+ (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
}
@@ -1256,8 +1318,12 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
@@ -1273,32 +1339,32 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0)),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm)),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0)),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm)),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
$vdata_in,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
@@ -1316,6 +1382,8 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i32, "BUFFER_ATOMIC_UMAX">;
defm : BufferAtomicPatterns<SIbuffer_atomic_and, i32, "BUFFER_ATOMIC_AND">;
defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">;
defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">;
defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">;
@@ -1326,37 +1394,39 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i64, "BUFFER_ATOMIC_UMAX_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">;
multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, 0,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
$vdata_in,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
@@ -1370,8 +1440,8 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMI
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1382,8 +1452,8 @@ def : GCNPat<
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1394,8 +1464,8 @@ def : GCNPat<
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1406,8 +1476,8 @@ def : GCNPat<
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1419,8 +1489,8 @@ def : GCNPat<
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : GCNPat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
>;
multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
@@ -1428,12 +1498,12 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins
def : GCNPat <
(vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
>;
def : GCNPat <
(vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
- (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
>;
}
@@ -1454,8 +1524,8 @@ multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
def : GCNPat <
(vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
- (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+ (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
>;
}
@@ -1478,12 +1548,12 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
def : GCNPat <
(vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
>;
def : GCNPat <
(vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0)
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
>;
}
@@ -1493,12 +1563,12 @@ multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
ValueType vt, PatFrag ld_frag> {
def : GCNPat <
(ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
>;
def : GCNPat <
(ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
>;
}
@@ -1512,7 +1582,10 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET,
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, zextloadi16_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
+
+foreach vt = Reg32Types.types in {
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
+}
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
@@ -1535,16 +1608,16 @@ defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D1
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
- // Store follows atomic op convention so address is forst
+ // Store follows atomic op convention so address is first
def : GCNPat <
(atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc), vt:$val),
- (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
>;
def : GCNPat <
(atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
- (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
>;
}
let SubtargetPredicate = isGFX6GFX7 in {
@@ -1558,8 +1631,8 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
def : GCNPat <
(st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)),
- (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)),
+ (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
>;
}
@@ -1573,13 +1646,13 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset)),
- (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
+ (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
>;
def : GCNPat <
(st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
u16imm:$offset)),
- (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0)
+ (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
>;
}
@@ -1587,7 +1660,11 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET,
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>;
+
+foreach vt = Reg32Types.types in {
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, vt, store_private>;
+}
+
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private, VReg_64>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private, VReg_96>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>;
@@ -1613,37 +1690,41 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, 0)),
+ (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, imm)),
+ (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, 0)),
+ (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, imm)),
+ (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
}
@@ -1671,37 +1752,41 @@ let SubtargetPredicate = HasPackedD16VMem in {
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, 0),
+ (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, imm),
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, 0),
+ (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
- imm:$offset, imm:$format, imm:$cachepolicy, imm),
+ timm:$offset, timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
}
@@ -1957,10 +2042,9 @@ defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03a>;
defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03b>;
defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03c>;
defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03d>;
-// FIXME-GFX6-GFX7-GFX10: Add following instructions:
-//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>;
-//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>;
-//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>;
+defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>;
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x050>;
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x051>;
defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x052>;
@@ -1975,10 +2059,9 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05b>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>;
// FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7.
-// FIXME-GFX6-GFX7-GFX10: Add following instructions:
-//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
-//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
-//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
+defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>;
defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
@@ -2353,7 +2436,7 @@ let SubtargetPredicate = HasPackedD16VMem in {
def MUBUFInfoTable : GenericTable {
let FilterClass = "MUBUF_Pseudo";
let CppTypeName = "MUBUFInfo";
- let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"];
+ let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"];
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getMUBUFOpcodeHelper";
@@ -2364,7 +2447,26 @@ def getMUBUFInfoFromOpcode : SearchIndex {
let Key = ["Opcode"];
}
-def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex {
+def getMUBUFInfoFromBaseOpcodeAndElements : SearchIndex {
let Table = MUBUFInfoTable;
- let Key = ["BaseOpcode", "dwords"];
+ let Key = ["BaseOpcode", "elements"];
+}
+
+def MTBUFInfoTable : GenericTable {
+ let FilterClass = "MTBUF_Pseudo";
+ let CppTypeName = "MTBUFInfo";
+ let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getMTBUFOpcodeHelper";
+}
+
+def getMTBUFInfoFromOpcode : SearchIndex {
+ let Table = MTBUFInfoTable;
+ let Key = ["Opcode"];
+}
+
+def getMTBUFInfoFromBaseOpcodeAndElements : SearchIndex {
+ let Table = MTBUFInfoTable;
+ let Key = ["BaseOpcode", "elements"];
}
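
The new MTBUFInfoTable mirrors the existing MUBUF table and is keyed on the base opcode plus the element count (renamed from dwords). Conceptually, the generated search index resolves a (BaseOpcode, elements) pair to a table row; a self-contained sketch of that lookup follows, using a linear scan and hypothetical field types for brevity, whereas the generated code works on a sorted table.

#include <algorithm>
#include <vector>

// Hypothetical mirror of one generated table row; names are illustrative.
struct MTBUFInfoSketch {
  unsigned Opcode;
  unsigned BaseOpcode;
  unsigned Elements;
};

// What getMTBUFInfoFromBaseOpcodeAndElements conceptually does: return the
// row whose (BaseOpcode, Elements) key matches, or nullptr if none does.
static const MTBUFInfoSketch *
lookupByBaseOpcodeAndElements(const std::vector<MTBUFInfoSketch> &Table,
                              unsigned BaseOpcode, unsigned Elements) {
  auto It = std::find_if(Table.begin(), Table.end(),
                         [&](const MTBUFInfoSketch &Row) {
                           return Row.BaseOpcode == BaseOpcode &&
                                  Row.Elements == Elements;
                         });
  return It == Table.end() ? nullptr : &*It;
}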
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index c52eaaa3fdc5..816ec14a0e98 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -81,6 +81,17 @@ class DS_Real <DS_Pseudo ds> :
// DS Pseudo instructions
+class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs),
+ (ins rc:$data0, offset:$offset, gds:$gds),
+ "$data0$offset$gds"> {
+
+ let has_addr = 0;
+ let has_data1 = 0;
+ let has_vdst = 0;
+}
+
class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
@@ -317,13 +328,16 @@ class DS_GWS <string opName, dag ins, string asmOps>
class DS_GWS_0D <string opName>
: DS_GWS<opName,
- (ins offset:$offset, gds:$gds), "$offset gds">;
+ (ins offset:$offset, gds:$gds), "$offset gds"> {
+ let hasSideEffects = 1;
+}
class DS_GWS_1D <string opName>
: DS_GWS<opName,
(ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
let has_gws_data0 = 1;
+ let hasSideEffects = 1;
}
class DS_VOID <string opName> : DS_Pseudo<opName,
@@ -391,11 +405,12 @@ def DS_WRITE_B8_D16_HI : DS_1A1D_NORET<"ds_write_b8_d16_hi">;
def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">;
}
+} // End has_m0_read = 0
+
let SubtargetPredicate = HasDSAddTid in {
-def DS_WRITE_ADDTID_B32 : DS_1A1D_NORET<"ds_write_addtid_b32">;
+def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
}
-} // End has_m0_read = 0
} // End mayLoad = 0
defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
@@ -540,13 +555,14 @@ def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">;
def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">;
def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
}
+} // End has_m0_read = 0
let SubtargetPredicate = HasDSAddTid in {
-def DS_READ_ADDTID_B32 : DS_1A_RET<"ds_read_addtid_b32">;
-}
-} // End has_m0_read = 0
+def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">;
}
+} // End mayStore = 0
+
def DS_CONSUME : DS_0A_RET<"ds_consume">;
def DS_APPEND : DS_0A_RET<"ds_append">;
def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
@@ -600,13 +616,13 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
//===----------------------------------------------------------------------===//
def : GCNPat <
- (int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
+ (int_amdgcn_ds_swizzle i32:$src, timm:$offset16),
(DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
>;
class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
- (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
- (inst $ptr, (as_i16imm $offset), (i1 gds))
+ (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset))),
+ (inst $ptr, offset:$offset, (i1 gds))
>;
multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
@@ -621,8 +637,8 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
- (inst $ptr, (as_i16imm $offset), (i1 0), $in)
+ (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$in),
+ (inst $ptr, offset:$offset, (i1 0), $in)
>;
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
@@ -636,13 +652,20 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
-defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
+
+foreach vt = Reg32Types.types in {
+defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
+}
+
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
let AddedComplexity = 100 in {
-defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
+foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
+}
+
defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
} // End AddedComplexity = 100
@@ -664,8 +687,8 @@ def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
}
class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
- (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
- (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
+ (frag vt:$value, (DS1Addr1Offset i32:$ptr, i16:$offset)),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
>;
multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
@@ -681,8 +704,8 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
// Irritatingly, atomic_store reverses the order of operands from a
// normal store.
class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+ (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+ (inst $ptr, $value, offset:$offset, (i1 0))
>;
multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
@@ -699,9 +722,13 @@ defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
-defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">;
+
+foreach vt = VGPR_32.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
+}
+
+defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local_32">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local_64">;
let OtherPredicates = [D16PreservesUnusedBits] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
@@ -736,46 +763,49 @@ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
let AddedComplexity = 100 in {
-defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
+foreach vt = VReg_64.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
+}
+
defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
} // End AddedComplexity = 100
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
+ (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
>;
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local")>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
}
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
}
class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds))
+ (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))
>;
multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local")>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
}
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
}
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 4ec4be9bc485..ec2e2c4e8b71 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -1095,6 +1095,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
case 106: return createRegOperand(VCC);
case 108: return createRegOperand(TBA);
case 110: return createRegOperand(TMA);
+ case 125: return createRegOperand(SGPR_NULL);
case 126: return createRegOperand(EXEC);
case 235: return createRegOperand(SRC_SHARED_BASE);
case 236: return createRegOperand(SRC_SHARED_LIMIT);
@@ -1172,7 +1173,8 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
int TTmpIdx = getTTmpIdx(Val);
if (TTmpIdx >= 0) {
- return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx);
+ auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32);
+ return createSRegOperand(TTmpClsId, TTmpIdx);
} else if (Val > SGPR_MAX) {
return IsWave64 ? decodeSpecialReg64(Val)
: decodeSpecialReg32(Val);
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 0550092ce1d6..792e26d21f98 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -322,46 +322,46 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$
defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_RTN,
RAT_ATOMIC_XCHG_INT_NORET,
- atomic_swap_global_ret,
- atomic_swap_global_noret>;
+ atomic_swap_global_ret_32,
+ atomic_swap_global_noret_32>;
defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_RTN, RAT_ATOMIC_ADD_NORET,
- atomic_add_global_ret, atomic_add_global_noret>;
+ atomic_load_add_global_ret_32, atomic_load_add_global_noret_32>;
defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_RTN, RAT_ATOMIC_SUB_NORET,
- atomic_sub_global_ret, atomic_sub_global_noret>;
+ atomic_load_sub_global_ret_32, atomic_load_sub_global_noret_32>;
defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_RTN,
RAT_ATOMIC_MIN_INT_NORET,
- atomic_min_global_ret, atomic_min_global_noret>;
+ atomic_load_min_global_ret_32, atomic_load_min_global_noret_32>;
defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_RTN,
RAT_ATOMIC_MIN_UINT_NORET,
- atomic_umin_global_ret, atomic_umin_global_noret>;
+ atomic_load_umin_global_ret_32, atomic_load_umin_global_noret_32>;
defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_RTN,
RAT_ATOMIC_MAX_INT_NORET,
- atomic_max_global_ret, atomic_max_global_noret>;
+ atomic_load_max_global_ret_32, atomic_load_max_global_noret_32>;
defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_RTN,
RAT_ATOMIC_MAX_UINT_NORET,
- atomic_umax_global_ret, atomic_umax_global_noret>;
+ atomic_load_umax_global_ret_32, atomic_load_umax_global_noret_32>;
defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_RTN, RAT_ATOMIC_AND_NORET,
- atomic_and_global_ret, atomic_and_global_noret>;
+ atomic_load_and_global_ret_32, atomic_load_and_global_noret_32>;
defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_RTN, RAT_ATOMIC_OR_NORET,
- atomic_or_global_ret, atomic_or_global_noret>;
+ atomic_load_or_global_ret_32, atomic_load_or_global_noret_32>;
defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_RTN, RAT_ATOMIC_XOR_NORET,
- atomic_xor_global_ret, atomic_xor_global_noret>;
+ atomic_load_xor_global_ret_32, atomic_load_xor_global_noret_32>;
defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
RAT_ATOMIC_INC_UINT_NORET,
- atomic_add_global_ret,
- atomic_add_global_noret, 1>;
+ atomic_load_add_global_ret_32,
+ atomic_load_add_global_noret_32, 1>;
defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
RAT_ATOMIC_INC_UINT_NORET,
- atomic_sub_global_ret,
- atomic_sub_global_noret, -1>;
+ atomic_load_sub_global_ret_32,
+ atomic_load_sub_global_noret_32, -1>;
defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
RAT_ATOMIC_DEC_UINT_NORET,
- atomic_add_global_ret,
- atomic_add_global_noret, -1>;
+ atomic_load_add_global_ret_32,
+ atomic_load_add_global_noret_32, -1>;
defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
RAT_ATOMIC_DEC_UINT_NORET,
- atomic_sub_global_ret,
- atomic_sub_global_noret, 1>;
+ atomic_load_sub_global_ret_32,
+ atomic_load_sub_global_noret_32, 1>;
// Should be predicated on FeatureFP64
// def FMA_64 : R600_3OP <
@@ -628,37 +628,37 @@ def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
[(truncstorei16_local i32:$src1, i32:$src0)]
>;
def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
- [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_add_local_32 i32:$src0, i32:$src1))]
>;
def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
- [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_sub_local_32 i32:$src0, i32:$src1))]
>;
def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND",
- [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_and_local_32 i32:$src0, i32:$src1))]
>;
def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR",
- [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_or_local_32 i32:$src0, i32:$src1))]
>;
def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR",
- [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_xor_local_32 i32:$src0, i32:$src1))]
>;
def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT",
- [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_min_local_32 i32:$src0, i32:$src1))]
>;
def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT",
- [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_max_local_32 i32:$src0, i32:$src1))]
>;
def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT",
- [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_umin_local_32 i32:$src0, i32:$src1))]
>;
def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT",
- [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_umax_local_32 i32:$src0, i32:$src1))]
>;
def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
- [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_swap_local_32 i32:$src0, i32:$src1))]
>;
def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
- [(set i32:$dst, (atomic_cmp_swap_local i32:$src0, i32:$src1, i32:$src2))]
+ [(set i32:$dst, (atomic_cmp_swap_local_32 i32:$src0, i32:$src1, i32:$src2))]
>;
def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
[(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))]
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 889f60dae920..80ee17eba141 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -270,7 +270,7 @@ multiclass FLAT_Atomic_Pseudo<
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = getIsFP<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
(ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
@@ -300,7 +300,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = getIsFP<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
@@ -333,7 +333,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = getIsFP<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
@@ -564,76 +564,76 @@ defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswa
v2i64, VReg_128>;
defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",
- VGPR_32, i32, atomic_swap_global>;
+ VGPR_32, i32, atomic_swap_global_32>;
defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2",
- VReg_64, i64, atomic_swap_global>;
+ VReg_64, i64, atomic_swap_global_64>;
defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add",
- VGPR_32, i32, atomic_add_global>;
+ VGPR_32, i32, atomic_load_add_global_32>;
defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub",
- VGPR_32, i32, atomic_sub_global>;
+ VGPR_32, i32, atomic_load_sub_global_32>;
defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin",
- VGPR_32, i32, atomic_min_global>;
+ VGPR_32, i32, atomic_load_min_global_32>;
defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin",
- VGPR_32, i32, atomic_umin_global>;
+ VGPR_32, i32, atomic_load_umin_global_32>;
defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax",
- VGPR_32, i32, atomic_max_global>;
+ VGPR_32, i32, atomic_load_max_global_32>;
defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax",
- VGPR_32, i32, atomic_umax_global>;
+ VGPR_32, i32, atomic_load_umax_global_32>;
defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and",
- VGPR_32, i32, atomic_and_global>;
+ VGPR_32, i32, atomic_load_and_global_32>;
defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or",
- VGPR_32, i32, atomic_or_global>;
+ VGPR_32, i32, atomic_load_or_global_32>;
defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor",
- VGPR_32, i32, atomic_xor_global>;
+ VGPR_32, i32, atomic_load_xor_global_32>;
defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc",
- VGPR_32, i32, atomic_inc_global>;
+ VGPR_32, i32, atomic_inc_global_32>;
defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec",
- VGPR_32, i32, atomic_dec_global>;
+ VGPR_32, i32, atomic_dec_global_32>;
defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2",
- VReg_64, i64, atomic_add_global>;
+ VReg_64, i64, atomic_load_add_global_64>;
defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2",
- VReg_64, i64, atomic_sub_global>;
+ VReg_64, i64, atomic_load_sub_global_64>;
defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2",
- VReg_64, i64, atomic_min_global>;
+ VReg_64, i64, atomic_load_min_global_64>;
defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2",
- VReg_64, i64, atomic_umin_global>;
+ VReg_64, i64, atomic_load_umin_global_64>;
defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2",
- VReg_64, i64, atomic_max_global>;
+ VReg_64, i64, atomic_load_max_global_64>;
defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2",
- VReg_64, i64, atomic_umax_global>;
+ VReg_64, i64, atomic_load_umax_global_64>;
defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2",
- VReg_64, i64, atomic_and_global>;
+ VReg_64, i64, atomic_load_and_global_64>;
defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2",
- VReg_64, i64, atomic_or_global>;
+ VReg_64, i64, atomic_load_or_global_64>;
defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2",
- VReg_64, i64, atomic_xor_global>;
+ VReg_64, i64, atomic_load_xor_global_64>;
defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
- VReg_64, i64, atomic_inc_global>;
+ VReg_64, i64, atomic_inc_global_64>;
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
- VReg_64, i64, atomic_dec_global>;
+ VReg_64, i64, atomic_dec_global_64>;
} // End is_flat_global = 1
} // End SubtargetPredicate = HasFlatGlobalInsts
@@ -686,10 +686,10 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
- "global_atomic_add_f32", VGPR_32, f32, atomic_add_global
+ "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
- "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+ "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
>;
} // End SubtargetPredicate = HasAtomicFaddInsts
@@ -777,8 +777,6 @@ def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>;
@@ -787,41 +785,50 @@ def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32, VReg_64>;
+
+foreach vt = Reg32Types.types in {
+def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>;
+def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt, VReg_64>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
+}
+
def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32, VReg_96>;
def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32, VReg_128>;
def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>;
def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64, VReg_64>;
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>;
def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
@@ -847,9 +854,6 @@ def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
} // End OtherPredicates = [HasFlatAddressSpace]
-def atomic_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_fadd>;
-def atomic_pk_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_pk_fadd>;
-
let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in {
def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
@@ -863,8 +867,16 @@ def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, v2i32>;
+foreach vt = Reg32Types.types in {
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, vt>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, vt, VGPR_32>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, vt>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, vt, VReg_64>;
+}
+
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX3, load_global, v3i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>;
@@ -875,8 +887,6 @@ def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32, VGPR_32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16, VGPR_32>;
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32, VGPR_32>;
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32, VReg_64>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX3, store_global, v3i32, VReg_96>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32, VReg_128>;
@@ -902,36 +912,36 @@ def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64, VReg_64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
-
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
+
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>;
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global, f32>;
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global, v2f16>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global_noret, f32>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global_noret, v2f16>;
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
@@ -1174,7 +1184,7 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
let AssemblerPredicate = isGFX10Plus;
let DecoderNamespace = "GFX10";
- let Inst{11-0} = {offset{12}, offset{10-0}};
+ let Inst{11-0} = offset{11-0};
let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue);
let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
let Inst{55} = 0;
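
The last FLAT hunk fixes how the 12-bit GFX10 offset field is packed: instead of splicing offset bit 12 above offset bits 10-0, the low twelve bits of the offset are now encoded contiguously. A minimal C++ sketch of the before/after packing, for illustration only:

#include <cstdint>

// Old (incorrect) packing: {offset{12}, offset{10-0}} placed offset bit 12
// at position 11 of the field and dropped offset bit 11 entirely.
static uint32_t encodeFlatOffsetOld(uint32_t Offset) {
  return (((Offset >> 12) & 0x1) << 11) | (Offset & 0x7FF);
}

// Fixed packing: Inst{11-0} = offset{11-0}.
static uint32_t encodeFlatOffsetNew(uint32_t Offset) {
  return Offset & 0xFFF;
}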
diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp
index e1845e2e8e87..98678873e37c 100644
--- a/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -41,6 +41,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -155,8 +156,6 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
RegSubRegPair CombOldVGPR,
bool CombBCZ) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
- assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
- TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
auto OrigOp = OrigMI.getOpcode();
auto DPPOp = getDPPOp(OrigOp);
@@ -178,7 +177,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
if (OldIdx != -1) {
assert(OldIdx == NumOperands);
assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
- DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
+ auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
+ DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
+ CombOldVGPR.SubReg);
++NumOperands;
} else {
// TODO: this discards MAC/FMA instructions for now, let's add it later
@@ -195,6 +196,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
DPPInst.addImm(Mod0->getImm());
++NumOperands;
+ } else if (AMDGPU::getNamedOperandIdx(DPPOp,
+ AMDGPU::OpName::src0_modifiers) != -1) {
+ DPPInst.addImm(0);
+ ++NumOperands;
}
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(Src0);
@@ -214,6 +219,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
DPPInst.addImm(Mod1->getImm());
++NumOperands;
+ } else if (AMDGPU::getNamedOperandIdx(DPPOp,
+ AMDGPU::OpName::src1_modifiers) != -1) {
+ DPPInst.addImm(0);
+ ++NumOperands;
}
if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
@@ -344,6 +353,10 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
assert(DstOpnd && DstOpnd->isReg());
auto DPPMovReg = DstOpnd->getReg();
+ if (DPPMovReg.isPhysical()) {
+ LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n");
+ return false;
+ }
if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
" for all uses\n");
@@ -362,7 +375,13 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
bool BoundCtrlZero = BCZOpnd->getImm();
auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
+ auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(OldOpnd && OldOpnd->isReg());
+ assert(SrcOpnd && SrcOpnd->isReg());
+ if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
+ LLVM_DEBUG(dbgs() << " failed: dpp move reads physreg\n");
+ return false;
+ }
auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
// OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
@@ -408,6 +427,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
+ DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos;
auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
// try to reuse previous old reg if it's undefined (IMPLICIT_DEF)
if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
@@ -420,13 +440,49 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
OrigMIs.push_back(&MovMI);
bool Rollback = true;
+ SmallVector<MachineOperand*, 16> Uses;
+
for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
+ Uses.push_back(&Use);
+ }
+
+ while (!Uses.empty()) {
+ MachineOperand *Use = Uses.pop_back_val();
Rollback = true;
- auto &OrigMI = *Use.getParent();
+ auto &OrigMI = *Use->getParent();
LLVM_DEBUG(dbgs() << " try: " << OrigMI);
auto OrigOp = OrigMI.getOpcode();
+ if (OrigOp == AMDGPU::REG_SEQUENCE) {
+ Register FwdReg = OrigMI.getOperand(0).getReg();
+ unsigned FwdSubReg = 0;
+
+ if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
+ LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
+ " for all uses\n");
+ break;
+ }
+
+ unsigned OpNo, E = OrigMI.getNumOperands();
+ for (OpNo = 1; OpNo < E; OpNo += 2) {
+ if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
+ FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
+ break;
+ }
+ }
+
+ if (!FwdSubReg)
+ break;
+
+ for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
+ if (Op.getSubReg() == FwdSubReg)
+ Uses.push_back(&Op);
+ }
+ RegSeqWithOpNos[&OrigMI].push_back(OpNo);
+ continue;
+ }
+
if (TII->isVOP3(OrigOp)) {
if (!TII->hasVALU32BitEncoding(OrigOp)) {
LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
@@ -447,14 +503,14 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
}
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
- if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
+ if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
} else if (OrigMI.isCommutable() &&
- &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+ Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
auto *BB = OrigMI.getParent();
auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
BB->insert(OrigMI, NewMI);
@@ -475,9 +531,22 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
OrigMIs.push_back(&OrigMI);
}
+ Rollback |= !Uses.empty();
+
for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
MI->eraseFromParent();
+ if (!Rollback) {
+ for (auto &S : RegSeqWithOpNos) {
+ if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
+ S.first->eraseFromParent();
+ continue;
+ }
+ while (!S.second.empty())
+ S.first->getOperand(S.second.pop_back_val()).setIsUndef(true);
+ }
+ }
+
return !Rollback;
}
@@ -498,6 +567,13 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
Changed = true;
++NumDPPMovsCombined;
+ } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
+ auto Split = TII->expandMovDPP64(MI);
+ for (auto M : { Split.first, Split.second }) {
+ if (combineDPPMov(*M))
+ ++NumDPPMovsCombined;
+ }
+ Changed = true;
}
}
}
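
The GCNDPPCombine change replaces the single pass over the DPP mov's direct uses with an explicit worklist, so uses reached through a REG_SEQUENCE that forwards the value (on the matching subregister) are considered as well, and those REG_SEQUENCE instructions are later erased or have the forwarded operands marked undef. A simplified, self-contained sketch of that traversal pattern, with hypothetical node types standing in for machine operands:

#include <vector>

// Hypothetical stand-ins for machine operands/instructions in this sketch.
struct UseNode {
  bool IsForwarder = false;              // models a forwarding REG_SEQUENCE
  std::vector<UseNode *> ForwardedUses;  // uses of the forwarder's result
};

// Visit every use, including uses reached through forwarding nodes,
// mirroring the worklist loop added to combineDPPMov().
template <typename VisitFn>
static void visitAllUses(const std::vector<UseNode *> &Roots, VisitFn Visit) {
  std::vector<UseNode *> Worklist(Roots.begin(), Roots.end());
  while (!Worklist.empty()) {
    UseNode *U = Worklist.back();
    Worklist.pop_back();
    if (U->IsForwarder) {
      // Do not combine at the forwarder; queue the uses it feeds instead.
      for (UseNode *Fwd : U->ForwardedUses)
        Worklist.push_back(Fwd);
      continue;
    }
    Visit(U);
  }
}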
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 885239e2faed..9528aee4c50e 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -726,7 +726,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
if (!TRI->isVGPR(MRI, Def.getReg()))
return WaitStatesNeeded;
- unsigned Reg = Def.getReg();
+ Register Reg = Def.getReg();
auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
int DataIdx = createsVALUHazard(*MI);
return DataIdx >= 0 &&
@@ -792,7 +792,7 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
return 0;
- unsigned LaneSelectReg = LaneSelectOp->getReg();
+ Register LaneSelectReg = LaneSelectOp->getReg();
auto IsHazardFn = [TII] (MachineInstr *MI) {
return TII->isVALU(*MI);
};
@@ -891,7 +891,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
// Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
// which is always a VGPR and available.
auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
- unsigned Reg = Src0->getReg();
+ Register Reg = Src0->getReg();
bool IsUndef = Src0->isUndef();
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32))
@@ -952,6 +952,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
unsigned SDSTName;
switch (MI->getOpcode()) {
case AMDGPU::V_READLANE_B32:
+ case AMDGPU::V_READLANE_B32_gfx10:
case AMDGPU::V_READFIRSTLANE_B32:
SDSTName = AMDGPU::OpName::vdst;
break;
@@ -976,7 +977,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
if (!SDST)
return false;
- const unsigned SDSTReg = SDST->getReg();
+ const Register SDSTReg = SDST->getReg();
auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
};
@@ -1251,14 +1252,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
const int MaxWaitStates = 18;
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
unsigned HazardDefLatency = 0;
auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
(MachineInstr *MI) {
if (!IsMFMAFn(MI))
return false;
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
if (DstReg == Reg)
return false;
HazardDefLatency = std::max(HazardDefLatency,
@@ -1304,7 +1305,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
return false;
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
return TRI.regsOverlap(Reg, DstReg);
};
@@ -1330,14 +1331,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
const int MaxWaitStates = 13;
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
unsigned HazardDefLatency = 0;
auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
(MachineInstr *MI) {
if (!IsMFMAFn(MI))
return false;
- unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
HazardDefLatency = std::max(HazardDefLatency,
TSchedModel.computeInstrLatency(MI));
return TRI.regsOverlap(Reg, DstReg);
@@ -1376,7 +1377,7 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
continue;
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
const int AccVgprReadLdStWaitStates = 2;
const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
index 1eb617640c32..39072af7d871 100644
--- a/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 3525174223bd..90ab6a14ce20 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -237,7 +237,7 @@ public:
GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
StrategyKind S)
- : BaseClass(C, llvm::make_unique<SchedStrategyStub>())
+ : BaseClass(C, std::make_unique<SchedStrategyStub>())
, Context(C)
, Strategy(S)
, UPTracker(*LIS) {
diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 51c4c99cfb18..36a8f74150f5 100644
--- a/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -173,11 +173,11 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
bool NSA = false;
for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
- unsigned Reg = Op.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+ Register Reg = Op.getReg();
+ if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
return NSA_Status::FIXED;
- unsigned PhysReg = VRM->getPhys(Reg);
+ Register PhysReg = VRM->getPhys(Reg);
if (!Fast) {
if (!PhysReg)
@@ -276,7 +276,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
SlotIndex MinInd, MaxInd;
for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
LiveInterval *LI = &LIS->getInterval(Reg);
if (llvm::find(Intervals, LI) != Intervals.end()) {
// Same register used, unable to make sequential
diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
index f0d47eaa4ed1..2927d4eb745a 100644
--- a/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -230,7 +230,7 @@ private:
public:
Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
return Printable([Reg, SubReg, this](raw_ostream &OS) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
OS << llvm::printReg(Reg, TRI);
return;
}
@@ -275,7 +275,7 @@ char GCNRegBankReassign::ID = 0;
char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
- assert (TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Register::isPhysicalRegister(Reg));
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getRegSizeInBits(*RC);
@@ -293,7 +293,7 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
int Bank) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (!VRM->isAssignedReg(Reg))
return 0;
@@ -364,7 +364,7 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
if (!Op.isReg() || Op.isUndef())
continue;
- unsigned R = Op.getReg();
+ Register R = Op.getReg();
if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
continue;
@@ -420,12 +420,12 @@ unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
}
bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
- if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+ if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
return false;
const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
- unsigned PhysReg = VRM->getPhys(Reg);
+ Register PhysReg = VRM->getPhys(Reg);
if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
return false;
@@ -654,7 +654,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
}
std::sort(BankStalls.begin(), BankStalls.end());
- unsigned OrigReg = VRM->getPhys(C.Reg);
+ Register OrigReg = VRM->getPhys(C.Reg);
LRM->unassign(LI);
while (!BankStalls.empty()) {
BankStall BS = BankStalls.pop_back_val();
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 39460fbd8a84..d593204cba05 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -40,7 +40,7 @@ void llvm::printLivesAt(SlotIndex SI,
<< *LIS.getInstructionFromIndex(SI);
unsigned Num = 0;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ const unsigned Reg = Register::index2VirtReg(I);
if (!LIS.hasInterval(Reg))
continue;
const auto &LI = LIS.getInterval(Reg);
@@ -84,7 +84,7 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
unsigned GCNRegPressure::getRegKind(unsigned Reg,
const MachineRegisterInfo &MRI) {
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Register::isVirtualRegister(Reg));
const auto RC = MRI.getRegClass(Reg);
auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
return STI->isSGPRClass(RC) ?
@@ -183,7 +183,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
- OS << "VGPRs: " << getVGPRNum();
+ OS << "VGPRs: " << Value[VGPR32] << ' ';
+ OS << "AGPRs: " << Value[AGPR32];
if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
OS << ", SGPRs: " << getSGPRNum();
if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
@@ -196,8 +197,7 @@ void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
static LaneBitmask getDefRegMask(const MachineOperand &MO,
const MachineRegisterInfo &MRI) {
- assert(MO.isDef() && MO.isReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+ assert(MO.isDef() && MO.isReg() && Register::isVirtualRegister(MO.getReg()));
// We don't rely on read-undef flag because in case of tentative schedule
// tracking it isn't set correctly yet. This works correctly however since
@@ -210,8 +210,7 @@ static LaneBitmask getDefRegMask(const MachineOperand &MO,
static LaneBitmask getUsedRegMask(const MachineOperand &MO,
const MachineRegisterInfo &MRI,
const LiveIntervals &LIS) {
- assert(MO.isUse() && MO.isReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+ assert(MO.isUse() && MO.isReg() && Register::isVirtualRegister(MO.getReg()));
if (auto SubReg = MO.getSubReg())
return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
@@ -232,7 +231,7 @@ collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
SmallVector<RegisterMaskPair, 8> Res;
for (const auto &MO : MI.operands()) {
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
continue;
if (!MO.isUse() || !MO.readsReg())
continue;
@@ -278,7 +277,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
const MachineRegisterInfo &MRI) {
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- auto Reg = TargetRegisterInfo::index2VirtReg(I);
+ auto Reg = Register::index2VirtReg(I);
if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -329,8 +328,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
MaxPressure = max(AtMIPressure, MaxPressure);
for (const auto &MO : MI.defs()) {
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) ||
- MO.isDead())
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()) || MO.isDead())
continue;
auto Reg = MO.getReg();
@@ -408,8 +406,8 @@ void GCNDownwardRPTracker::advanceToNext() {
for (const auto &MO : LastTrackedMI->defs()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
auto &LiveMask = LiveRegs[Reg];
auto PrevMask = LiveMask;
@@ -500,7 +498,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
const MachineRegisterInfo &MRI) {
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned Reg = Register::index2VirtReg(I);
auto It = LiveRegs.find(Reg);
if (It != LiveRegs.end() && It->second.any())
OS << ' ' << printVRegOrUnit(Reg, TRI) << ':'
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index e4894418b943..5862cdb04166 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -214,7 +214,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- auto Reg = TargetRegisterInfo::index2VirtReg(I);
+ auto Reg = Register::index2VirtReg(I);
if (!LIS.hasInterval(Reg))
continue;
auto &LI = LIS.getInterval(Reg);
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 4ea990ae490e..973491a70d3c 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -71,8 +71,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// the tracker, so we need to pass those functions a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
- std::vector<unsigned> Pressure;
- std::vector<unsigned> MaxPressure;
+ Pressure.clear();
+ MaxPressure.clear();
if (AtTop)
TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
@@ -103,10 +103,10 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// the analysis to look through dependencies to find the path with the least
// register pressure.
- // We only need to update the RPDelata for instructions that increase
- // register pressure. Instructions that decrease or keep reg pressure
- // the same will be marked as RegExcess in tryCandidate() when they
- // are compared with instructions that increase the register pressure.
+ // We only need to update the RPDelta for instructions that increase register
+ // pressure. Instructions that decrease or keep reg pressure the same will be
+ // marked as RegExcess in tryCandidate() when they are compared with
+ // instructions that increase the register pressure.
if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
@@ -160,6 +160,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
Cand.setBest(TryCand);
+ LLVM_DEBUG(traceCandidate(Cand));
}
}
}
@@ -195,6 +196,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(BotCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
+ assert(TCand.SU == BotCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
}
// Check if the top Q has a better candidate.
@@ -206,6 +216,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(TopCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
+ assert(TCand.SU == TopCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
}
// Pick best from BotCand and TopCand.
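The Pressure/MaxPressure hunk above replaces two std::vector locals with class members (added to GCNSchedStrategy.h in the next diff) that are cleared on every initCandidate call, so the scratch buffers keep their capacity instead of being reallocated per candidate. A minimal sketch of that pattern, with invented names rather than the real scheduler types:

    #include <vector>

    // Hypothetical strategy object; only the allocation pattern matters here.
    class Strategy {
      std::vector<unsigned> Pressure;     // reused scratch buffers: grow once,
      std::vector<unsigned> MaxPressure;  // keep their capacity afterwards
    public:
      void initCandidate() {
        Pressure.clear();                 // size -> 0, capacity retained
        MaxPressure.clear();
        Pressure.resize(8, 0);            // stand-in for the tracker filling them
        MaxPressure.resize(8, 0);
      }
    };

    int main() {
      Strategy S;
      for (int I = 0; I < 1000; ++I)
        S.initCandidate();                // no per-call heap traffic after warm-up
      return 0;
    }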
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index eaf3dee9ba5d..dd687a930c79 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -40,6 +40,9 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
const SIRegisterInfo *SRI,
unsigned SGPRPressure, unsigned VGPRPressure);
+ std::vector<unsigned> Pressure;
+ std::vector<unsigned> MaxPressure;
+
unsigned SGPRExcessLimit;
unsigned VGPRExcessLimit;
unsigned SGPRCriticalLimit;
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 57c0ba26cc3a..1f94ab799122 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -109,7 +109,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
MCContext *Ctx) {
int64_t SignedValue = static_cast<int64_t>(Value);
- switch (static_cast<unsigned>(Fixup.getKind())) {
+ switch (Fixup.getTargetKind()) {
case AMDGPU::fixup_si_sopp_br: {
int64_t BrImm = (SignedValue - 4) / 4;
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 6549a8d7d592..d352219a7a98 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -87,7 +87,7 @@ std::unique_ptr<MCObjectTargetWriter>
llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
bool HasRelocationAddend,
uint8_t ABIVersion) {
- return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
+ return std::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
HasRelocationAddend,
ABIVersion);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 01b53432cbb7..a9888e6ed924 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -196,6 +196,10 @@ void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "slc");
}
+void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+}
+
void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "tfe");
@@ -292,35 +296,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
}
#endif
- unsigned AltName = AMDGPU::Reg32;
-
- if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg64;
- else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg128;
- else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SReg_96RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg96;
- else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SReg_160RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg160;
- else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg256;
- else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg512;
- else if (MRI.getRegClass(AMDGPU::VReg_1024RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SReg_1024RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::AReg_1024RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg1024;
-
- O << getRegisterName(RegNo, AltName);
+ O << getRegisterName(RegNo);
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -623,9 +599,11 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_CNDMASK_B32_dpp_gfx10:
case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_CNDMASK_B32_dpp8_gfx10:
case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
@@ -689,6 +667,7 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
switch (MI->getOpcode()) {
default: break;
+ case AMDGPU::V_CNDMASK_B32_sdwa_gfx10:
case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index b544d1ef3605..66b70831ff9e 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -12,7 +12,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
-#include "AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -26,8 +25,7 @@ public:
//Autogenerated by tblgen
void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
raw_ostream &O);
- static const char *getRegisterName(unsigned RegNo,
- unsigned AltIdx = AMDGPU::NoRegAltName);
+ static const char *getRegisterName(unsigned RegNo);
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
const MCSubtargetInfo &STI) override;
@@ -74,6 +72,8 @@ private:
raw_ostream &O);
void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 8f11433476f4..c15da8075a34 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -250,7 +250,7 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
bool AMDGPUTargetAsmStreamer::EmitCodeEnd() {
const uint32_t Encoded_s_code_end = 0xbf9f0000;
OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n';
- OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n';
+ OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n';
return true;
}
@@ -602,7 +602,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
MCStreamer &OS = getStreamer();
OS.PushSection();
OS.EmitValueToAlignment(64, Encoded_s_code_end, 4);
- for (unsigned I = 0; I < 32; ++I)
+ for (unsigned I = 0; I < 48; ++I)
OS.EmitIntValue(Encoded_s_code_end, 4);
OS.PopSection();
return true;
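For reference, the arithmetic behind the EmitCodeEnd bump in both streamers above: the padding grows from 32 to 48 s_code_end dwords after the existing 64-byte alignment. A quick self-contained check of the byte counts:

    // Padding emitted after the last instruction, in bytes (values from the diff).
    constexpr unsigned DwordBytes = 4;
    constexpr unsigned OldPadding = 32 * DwordBytes;  // 128 bytes
    constexpr unsigned NewPadding = 48 * DwordBytes;  // 192 bytes
    static_assert(OldPadding == 128 && NewPadding == 192, "sizes as in the diff");
    static_assert(NewPadding - OldPadding == 64, "one extra 64-byte block of padding");
    int main() { return 0; }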
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 4735e6cb2446..f33ad950d5d9 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -26,7 +26,7 @@ def MIMGEncoding : GenericEnum {
// Represent an ISA-level opcode, independent of the encoding and the
// vdata/vaddr size.
-class MIMGBaseOpcode {
+class MIMGBaseOpcode : PredicateControl {
MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(NAME);
bit Store = 0;
bit Atomic = 0;
@@ -291,7 +291,7 @@ multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm,
multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0,
bit isResInfo = 0> {
- def "" : MIMGBaseOpcode, PredicateControl {
+ def "" : MIMGBaseOpcode {
let Coordinates = !if(isResInfo, 0, 1);
let LodOrClampOrMip = mip;
let HasD16 = has_d16;
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp
index 3fb18862fca8..b29cd75f75cf 100644
--- a/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -104,7 +104,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Functions need to be cacheline (256B) aligned.
- MF.ensureAlignment(8);
+ MF.ensureAlignment(Align(256));
SetupMachineFunction(MF);
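The ensureAlignment change above is an interface migration rather than a behavioural one: the old call took a log2 value, so 8 meant 2^8 = 256 bytes, which Align(256) now states directly. That reading is inferred from the 256B cacheline comment on the preceding line rather than from API documentation; a trivial check of the equivalence:

    #include <cassert>
    int main() {
      unsigned OldLog2Arg = 8;            // what ensureAlignment(8) encoded
      unsigned Bytes = 1u << OldLog2Arg;  // 256-byte cacheline from the comment
      assert(Bytes == 256);
      return 0;
    }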
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 8098b81d1ea2..e4160ac11c86 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -303,7 +303,7 @@ private:
if (!MO.isReg())
continue;
if (MO.isDef()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (R600::R600_Reg128RegClass.contains(Reg))
DstMI = Reg;
else
@@ -312,7 +312,7 @@ private:
&R600::R600_Reg128RegClass);
}
if (MO.isUse()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (R600::R600_Reg128RegClass.contains(Reg))
SrcMI = Reg;
else
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index c6e8a060d8a0..fd75c41040e1 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -135,7 +135,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
const R600RegisterInfo &TRI = TII->getRegisterInfo();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
for (unsigned Chan = 0; Chan < 4; ++Chan) {
@@ -155,12 +155,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
unsigned Opcode = BMI->getOpcode();
// While not strictly necessary from a hw point of view, we force
// all src operands of a dot4 inst to belong to the same slot.
- unsigned Src0 = BMI->getOperand(
- TII->getOperandIdx(Opcode, R600::OpName::src0))
- .getReg();
- unsigned Src1 = BMI->getOperand(
- TII->getOperandIdx(Opcode, R600::OpName::src1))
- .getReg();
+ Register Src0 =
+ BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src0))
+ .getReg();
+ Register Src1 =
+ BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src1))
+ .getReg();
(void) Src0;
(void) Src1;
if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
@@ -205,10 +205,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// T0_Z = CUBE T1_X, T1_Z
// T0_W = CUBE T1_Y, T1_Z
for (unsigned Chan = 0; Chan < 4; Chan++) {
- unsigned DstReg = MI.getOperand(
- TII->getOperandIdx(MI, R600::OpName::dst)).getReg();
- unsigned Src0 = MI.getOperand(
- TII->getOperandIdx(MI, R600::OpName::src0)).getReg();
+ Register DstReg =
+ MI.getOperand(TII->getOperandIdx(MI, R600::OpName::dst)).getReg();
+ Register Src0 =
+ MI.getOperand(TII->getOperandIdx(MI, R600::OpName::src0)).getReg();
unsigned Src1 = 0;
// Determine the correct source registers
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
index 950e238f4979..283e4d1935ea 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -15,9 +15,9 @@ namespace llvm {
class R600FrameLowering : public AMDGPUFrameLowering {
public:
- R600FrameLowering(StackDirection D, unsigned StackAl, int LAO,
- unsigned TransAl = 1) :
- AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ R600FrameLowering(StackDirection D, Align StackAl, int LAO,
+ Align TransAl = Align::None())
+ : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~R600FrameLowering() override;
void emitPrologue(MachineFunction &MF,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index f80a53ba1dc6..659458b0b752 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -41,6 +41,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <iterator>
@@ -334,8 +335,8 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
case R600::MASK_WRITE: {
- unsigned maskedRegister = MI.getOperand(0).getReg();
- assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
+ Register maskedRegister = MI.getOperand(0).getReg();
+ assert(Register::isVirtualRegister(maskedRegister));
MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
break;
@@ -782,7 +783,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
return TrigVal;
// On R600 hw, COS/SIN input must be between -Pi and Pi.
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
- DAG.getConstantFP(3.14159265359, DL, MVT::f32));
+ DAG.getConstantFP(numbers::pif, DL, MVT::f32));
}
SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
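The LowerTrig hunk swaps a hand-written double literal for numbers::pif, the single-precision pi constant made available by the MathExtras.h include added at the top of this file. For an f32 constant the substitution is value-preserving: the old literal and pi round to the same float, as this standalone check shows (no LLVM headers involved):

    int main() {
      // 3.14159265359 differs from pi by ~2e-12, far below half a float ULP
      // near 3.14 (~1.2e-7), so both literals denote the same 32-bit value.
      static_assert(3.14159265359f == 3.14159265358979323846f,
                    "old literal and float pi are the same f32 constant");
      return 0;
    }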
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index d9e839fe2035..04a5e93f6213 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -97,8 +97,8 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {
for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(),
E = MBBI->operands_end(); I != E; ++I) {
- if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) &&
- I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg()))
+ if (I->isReg() && !Register::isVirtualRegister(I->getReg()) && I->isUse() &&
+ RI.isPhysRegLiveAcrossClauses(I->getReg()))
return false;
}
return true;
@@ -242,8 +242,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
E = MI.operands_end();
I != E; ++I) {
- if (!I->isReg() || !I->isUse() ||
- TargetRegisterInfo::isVirtualRegister(I->getReg()))
+ if (!I->isReg() || !I->isUse() || Register::isVirtualRegister(I->getReg()))
continue;
if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
@@ -294,7 +293,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
for (unsigned j = 0; j < 8; j++) {
MachineOperand &MO =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0]));
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
@@ -317,7 +316,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
if (SrcIdx < 0)
break;
MachineOperand &MO = MI.getOperand(SrcIdx);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
@@ -348,7 +347,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
unsigned i = 0;
for (const auto &Src : getSrcs(MI)) {
++i;
- unsigned Reg = Src.first->getReg();
+ Register Reg = Src.first->getReg();
int Index = RI.getEncodingValue(Reg) & 0xff;
if (Reg == R600::OQAP) {
Result.push_back(std::make_pair(Index, 0U));
@@ -865,7 +864,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
if (idx < 0)
return false;
- unsigned Reg = MI.getOperand(idx).getReg();
+ Register Reg = MI.getOperand(idx).getReg();
switch (Reg) {
default: return false;
case R600::PRED_SEL_ONE:
@@ -1038,7 +1037,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
- unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+ Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(),
getIndirectAddrRegClass()->getRegister(Address));
@@ -1052,7 +1051,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
- unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+ Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
MI.getOperand(ValOpIdx).getReg());
@@ -1193,8 +1192,7 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
for (std::pair<unsigned, unsigned> LI : MRI.liveins()) {
unsigned Reg = LI.first;
- if (TargetRegisterInfo::isVirtualRegister(Reg) ||
- !IndirectRC->contains(Reg))
+ if (Register::isVirtualRegister(Reg) || !IndirectRC->contains(Reg))
continue;
unsigned RegIndex;
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index 34267a909b5e..7569a2629539 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -183,7 +183,7 @@ isPhysicalRegCopy(MachineInstr *MI) {
if (MI->getOpcode() != R600::COPY)
return false;
- return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
+ return !Register::isVirtualRegister(MI->getOperand(1).getReg());
}
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
@@ -209,7 +209,7 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
const TargetRegisterClass *RC) const {
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!Register::isVirtualRegister(Reg)) {
return RC->contains(Reg);
} else {
return MRI->getRegClass(Reg) == RC;
@@ -270,7 +270,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
}
// Is the result already member of a X/Y/Z/W class ?
- unsigned DestReg = MI->getOperand(0).getReg();
+ Register DestReg = MI->getOperand(0).getReg();
if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
return AluT_X;
@@ -357,7 +357,7 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
if (DstIndex == -1) {
return;
}
- unsigned DestReg = MI->getOperand(DstIndex).getReg();
+ Register DestReg = MI->getOperand(DstIndex).getReg();
// PressureRegister crashes if an operand is defined and used in the same inst
// and we try to constrain its regclass
for (MachineInstr::mop_iterator It = MI->operands_begin(),
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 9f1cb6582b5c..cec7f563f480 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -58,7 +58,7 @@ using namespace llvm;
static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
assert(MRI.isSSA());
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return false;
const MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
return MI && MI->isImplicitDef();
@@ -197,17 +197,17 @@ unsigned getReassignedChan(
MachineInstr *R600VectorRegMerger::RebuildVector(
RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const {
- unsigned Reg = RSI->Instr->getOperand(0).getReg();
+ Register Reg = RSI->Instr->getOperand(0).getReg();
MachineBasicBlock::iterator Pos = RSI->Instr;
MachineBasicBlock &MBB = *Pos->getParent();
DebugLoc DL = Pos->getDebugLoc();
- unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
+ Register SrcVec = BaseRSI->Instr->getOperand(0).getReg();
DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
E = RSI->RegToChan.end(); It != E; ++It) {
- unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
+ Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
unsigned SubReg = (*It).first;
unsigned Swizzle = (*It).second;
unsigned Chan = getReassignedChan(RemapChan, Swizzle);
@@ -350,7 +350,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
MachineInstr &MI = *MII;
if (MI.getOpcode() != R600::REG_SEQUENCE) {
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
- unsigned Reg = MI.getOperand(1).getReg();
+ Register Reg = MI.getOperand(1).getReg();
for (MachineRegisterInfo::def_instr_iterator
It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end();
It != E; ++It) {
@@ -363,7 +363,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
RegSeqInfo RSI(*MRI, &MI);
// Are all uses of MI swizzleable?
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
if (!areAllUsesSwizzeable(Reg))
continue;
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index df200baf11c1..176269f9b68c 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -90,7 +90,7 @@ private:
if (DstIdx == -1) {
continue;
}
- unsigned Dst = BI->getOperand(DstIdx).getReg();
+ Register Dst = BI->getOperand(DstIdx).getReg();
if (isTrans || TII->isTransOnly(*BI)) {
Result[Dst] = R600::PS;
continue;
@@ -136,7 +136,7 @@ private:
int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
if (OperandIdx < 0)
continue;
- unsigned Src = MI.getOperand(OperandIdx).getReg();
+ Register Src = MI.getOperand(OperandIdx).getReg();
const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
if (It != PVs.end())
MI.getOperand(OperandIdx).setReg(It->second);
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 685df74490fe..ef12c1d24594 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -93,7 +93,7 @@ const RegClassWeight &R600RegisterInfo::getRegClassWeight(
}
bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
- assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(!Register::isVirtualRegister(Reg));
switch (Reg) {
case R600::OQAP:
diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp
index f8094e35816c..ee011286b8ff 100644
--- a/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -129,7 +129,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
continue;
// Create a register for the initialization value.
- unsigned PrevDst =
+ Register PrevDst =
MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
unsigned NewDst = 0; // Final initialized value will be in here
@@ -150,7 +150,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
NewDst =
MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
// Initialize dword
- unsigned SubReg =
+ Register SubReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
.addImm(0);
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a0e1ec6ac235..23ef56afc39c 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -99,7 +99,10 @@ enum : uint64_t {
FPAtomic = UINT64_C(1) << 53,
// Is a MFMA instruction.
- IsMAI = UINT64_C(1) << 54
+ IsMAI = UINT64_C(1) << 54,
+
+ // Is a DOT instruction.
+ IsDOT = UINT64_C(1) << 55
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -444,6 +447,7 @@ namespace DPP {
enum DppCtrl : unsigned {
QUAD_PERM_FIRST = 0,
+ QUAD_PERM_ID = 0xE4, // identity permutation
QUAD_PERM_LAST = 0xFF,
DPP_UNUSED1 = 0x100,
ROW_SHL0 = 0x100,
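QUAD_PERM_ID = 0xE4 earns its "identity permutation" comment from the quad_perm encoding: two bits per lane select which lane of the quad each lane reads, and 0xE4 is 0b11'10'01'00. A tiny decoder, assuming that standard 2-bit-per-lane layout:

    #include <cstdio>
    int main() {
      unsigned Ctrl = 0xE4;                      // QUAD_PERM_ID from the enum above
      for (unsigned Lane = 0; Lane < 4; ++Lane)  // low bits describe lane 0
        std::printf("lane %u reads lane %u\n", Lane, (Ctrl >> (2 * Lane)) & 3u);
      // Prints 0, 1, 2, 3: every lane reads itself, hence the identity permutation.
      return 0;
    }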
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 624953963cf4..65286751c12d 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -113,10 +113,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
public:
static char ID;
+ MachineRegisterInfo *MRI;
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+
SIFixSGPRCopies() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
+ void processPHINode(MachineInstr &MI);
+
StringRef getPassName() const override { return "SI Fix SGPR copies"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -148,7 +154,7 @@ static bool hasVectorOperands(const MachineInstr &MI,
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isReg() ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+ !Register::isVirtualRegister(MI.getOperand(i).getReg()))
continue;
if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
@@ -161,21 +167,19 @@ static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
const SIRegisterInfo &TRI,
const MachineRegisterInfo &MRI) {
- unsigned DstReg = Copy.getOperand(0).getReg();
- unsigned SrcReg = Copy.getOperand(1).getReg();
+ Register DstReg = Copy.getOperand(0).getReg();
+ Register SrcReg = Copy.getOperand(1).getReg();
- const TargetRegisterClass *SrcRC =
- TargetRegisterInfo::isVirtualRegister(SrcReg) ?
- MRI.getRegClass(SrcReg) :
- TRI.getPhysRegClass(SrcReg);
+ const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg)
+ ? MRI.getRegClass(SrcReg)
+ : TRI.getPhysRegClass(SrcReg);
// We don't really care about the subregister here.
// SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
- const TargetRegisterClass *DstRC =
- TargetRegisterInfo::isVirtualRegister(DstReg) ?
- MRI.getRegClass(DstReg) :
- TRI.getPhysRegClass(DstReg);
+ const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg)
+ ? MRI.getRegClass(DstReg)
+ : TRI.getPhysRegClass(DstReg);
return std::make_pair(SrcRC, DstRC);
}
@@ -199,10 +203,10 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
const SIInstrInfo *TII) {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
auto &Src = MI.getOperand(1);
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = Src.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- !TargetRegisterInfo::isVirtualRegister(DstReg))
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = Src.getReg();
+ if (!Register::isVirtualRegister(SrcReg) ||
+ !Register::isVirtualRegister(DstReg))
return false;
for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
@@ -238,7 +242,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
MachineRegisterInfo &MRI) {
assert(MI.isRegSequence());
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
return false;
@@ -250,7 +254,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
return false;
// It is illegal to have vreg inputs to a physreg defining reg_sequence.
- if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
+ if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
return false;
const TargetRegisterClass *SrcRC, *DstRC;
@@ -281,7 +285,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
bool IsAGPR = TRI->hasAGPRs(DstRC);
for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
- unsigned SrcReg = MI.getOperand(I).getReg();
+ Register SrcReg = MI.getOperand(I).getReg();
unsigned SrcSubReg = MI.getOperand(I).getSubReg();
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
@@ -291,7 +295,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
- unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
+ Register TmpReg = MRI.createVirtualRegister(NewSrcRC);
BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
TmpReg)
@@ -299,7 +303,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
if (IsAGPR) {
const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
- unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC);
+ Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
@@ -315,52 +319,6 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
return true;
}
-static bool phiHasVGPROperands(const MachineInstr &PHI,
- const MachineRegisterInfo &MRI,
- const SIRegisterInfo *TRI,
- const SIInstrInfo *TII) {
- for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
- unsigned Reg = PHI.getOperand(i).getReg();
- if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
- return true;
- }
- return false;
-}
-
-static bool phiHasBreakDef(const MachineInstr &PHI,
- const MachineRegisterInfo &MRI,
- SmallSet<unsigned, 8> &Visited) {
- for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
- unsigned Reg = PHI.getOperand(i).getReg();
- if (Visited.count(Reg))
- continue;
-
- Visited.insert(Reg);
-
- MachineInstr *DefInstr = MRI.getVRegDef(Reg);
- switch (DefInstr->getOpcode()) {
- default:
- break;
- case AMDGPU::SI_IF_BREAK:
- return true;
- case AMDGPU::PHI:
- if (phiHasBreakDef(*DefInstr, MRI, Visited))
- return true;
- }
- }
- return false;
-}
-
-static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
- const TargetRegisterInfo &TRI) {
- for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
- E = MBB.end(); I != E; ++I) {
- if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
- return true;
- }
- return false;
-}
-
static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
const MachineInstr *MoveImm,
const SIInstrInfo *TII,
@@ -422,12 +380,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
return false;
}
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
- const TargetRegisterInfo *TRI) {
- return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
- return hasTerminatorThatModifiesExec(*MBB, *TRI); });
-}
-
// Checks if there is a potential path from instruction From to instruction To.
// If CutOff is specified and it sits in between on that path, we ignore the
// higher portion of the path and report it as not reachable.
@@ -468,6 +420,7 @@ getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo *TRI,
MachineDominatorTree &MDT,
const TargetInstrInfo *TII) {
// List of inits by immediate value.
@@ -482,7 +435,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
for (auto &MI : MRI.def_instructions(Reg)) {
MachineOperand *Imm = nullptr;
- for (auto &MO: MI.operands()) {
+ for (auto &MO : MI.operands()) {
if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
(!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
Imm = nullptr;
@@ -587,8 +540,44 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
}
- for (auto MI : MergedInstrs)
- MI->removeFromParent();
+ // Remove initializations that were merged into another.
+ for (auto &Init : Inits) {
+ auto &Defs = Init.second;
+ auto I = Defs.begin();
+ while (I != Defs.end()) {
+ if (MergedInstrs.count(*I)) {
+ (*I)->eraseFromParent();
+ I = Defs.erase(I);
+ } else
+ ++I;
+ }
+ }
+
+ // Try to schedule SGPR initializations as early as possible in the MBB.
+ for (auto &Init : Inits) {
+ auto &Defs = Init.second;
+ for (auto MI : Defs) {
+ auto MBB = MI->getParent();
+ MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
+ MachineBasicBlock::reverse_iterator B(BoundaryMI);
+ // Check if B should actually be a boundary. If not, set the previous
+ // instruction as the boundary instead.
+ if (!TII->isBasicBlockPrologue(*B))
+ B++;
+
+ auto R = std::next(MI->getReverseIterator());
+ const unsigned Threshold = 50;
+ // Search until B or Threshold for a place to insert the initialization.
+ for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
+ if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
+ TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
+ break;
+
+ // Move to directly after R.
+ if (&*--R != MI)
+ MBB->splice(*R, MBB, MI);
+ }
+ }
if (Changed)
MRI.clearKillFlags(Reg);
@@ -598,9 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+ TRI = ST.getRegisterInfo();
+ TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
SmallVector<MachineInstr *, 16> Worklist;
@@ -617,22 +606,39 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
case AMDGPU::COPY:
case AMDGPU::WQM:
+ case AMDGPU::SOFT_WQM:
case AMDGPU::WWM: {
- // If the destination register is a physical register there isn't really
- // much we can do to fix this.
- if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
- continue;
+ Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *SrcRC, *DstRC;
- std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
+
+ if (!Register::isVirtualRegister(DstReg)) {
+ // If the destination register is a physical register there isn't
+ // really much we can do to fix this.
+ // Some special instructions use M0 as an input. Some even only use
+ // the first lane. Insert a readfirstlane and hope for the best.
+ if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
+ Register TmpReg
+ = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ BuildMI(MBB, MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
+ .add(MI.getOperand(1));
+ MI.getOperand(1).setReg(TmpReg);
+ }
+
+ continue;
+ }
+
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(SrcReg)) {
TII->moveToVALU(MI, MDT);
break;
}
- MachineInstr *DefMI = MRI.getVRegDef(SrcReg);
+ MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
@@ -651,70 +657,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
case AMDGPU::PHI: {
- unsigned Reg = MI.getOperand(0).getReg();
- if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
- break;
-
- // We don't need to fix the PHI if the common dominator of the
- // two incoming blocks terminates with a uniform branch.
- bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
- if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
- MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
- MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
-
- if (!predsHasDivergentTerminator(MBB0, TRI) &&
- !predsHasDivergentTerminator(MBB1, TRI)) {
- LLVM_DEBUG(dbgs()
- << "Not fixing PHI for uniform branch: " << MI << '\n');
- break;
- }
- }
-
- // If a PHI node defines an SGPR and any of its operands are VGPRs,
- // then we need to move it to the VALU.
- //
- // Also, if a PHI node defines an SGPR and has all SGPR operands
- // we must move it to the VALU, because the SGPR operands will
- // all end up being assigned the same register, which means
- // there is a potential for a conflict if different threads take
- // different control flow paths.
- //
- // For Example:
- //
- // sgpr0 = def;
- // ...
- // sgpr1 = def;
- // ...
- // sgpr2 = PHI sgpr0, sgpr1
- // use sgpr2;
- //
- // Will Become:
- //
- // sgpr2 = def;
- // ...
- // sgpr2 = def;
- // ...
- // use sgpr2
- //
- // The one exception to this rule is when one of the operands
- // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
- // instruction. In this case, there we know the program will
- // never enter the second block (the loop) without entering
- // the first block (where the condition is computed), so there
- // is no chance for values to be over-written.
-
- SmallSet<unsigned, 8> Visited;
- if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
- LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
- TII->moveToVALU(MI, MDT);
- }
-
+ processPHINode(MI);
break;
}
case AMDGPU::REG_SEQUENCE:
if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
!hasVectorOperands(MI, TRI)) {
- foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
+ foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
continue;
}
@@ -724,9 +673,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
case AMDGPU::INSERT_SUBREG: {
const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
- DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
- Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
- Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+ DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
+ Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
+ Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVectorRegisters(Src0RC) ||
TRI->hasVectorRegisters(Src1RC))) {
@@ -735,12 +684,159 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
break;
}
+ case AMDGPU::V_WRITELANE_B32: {
+ // Some architectures allow more than one constant bus access without
+ // SGPR restriction
+ if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
+ break;
+
+ // Writelane is special in that it can use SGPR and M0 (which would
+ // normally count as using the constant bus twice - but in this case it
+ // is allowed since the lane selector doesn't count as a use of the
+ // constant bus). However, it is still required to abide by the 1 SGPR
+ // rule. Apply a fix here as we might have multiple SGPRs after
+ // legalizing VGPRs to SGPRs
+ int Src0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ int Src1Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
+
+ // Check to see if the instruction violates the 1 SGPR rule
+ if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
+ Src0.getReg() != AMDGPU::M0) &&
+ (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
+ Src1.getReg() != AMDGPU::M0)) {
+
+ // Check for trivially easy constant prop into one of the operands.
+ // If this is the case then perform the operation now to resolve the SGPR
+ // issue. If we don't do that here we will always insert a mov to m0
+ // that can't be resolved in the later operand folding pass.
+ bool Resolved = false;
+ for (MachineOperand *MO : {&Src0, &Src1}) {
+ if (Register::isVirtualRegister(MO->getReg())) {
+ MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
+ if (DefMI && TII->isFoldableCopy(*DefMI)) {
+ const MachineOperand &Def = DefMI->getOperand(0);
+ if (Def.isReg() &&
+ MO->getReg() == Def.getReg() &&
+ MO->getSubReg() == Def.getSubReg()) {
+ const MachineOperand &Copied = DefMI->getOperand(1);
+ if (Copied.isImm() &&
+ TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
+ MO->ChangeToImmediate(Copied.getImm());
+ Resolved = true;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if (!Resolved) {
+ // Haven't managed to resolve by replacing an SGPR with an immediate
+ // Move src1 to be in M0
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .add(Src1);
+ Src1.ChangeToRegister(AMDGPU::M0, false);
+ }
+ }
+ break;
+ }
}
}
}
if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
- hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII);
+ hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
return true;
}
+
+void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
+ unsigned numVGPRUses = 0;
+ bool AllAGPRUses = true;
+ SetVector<const MachineInstr *> worklist;
+ SmallSet<const MachineInstr *, 4> Visited;
+ worklist.insert(&MI);
+ Visited.insert(&MI);
+ while (!worklist.empty()) {
+ const MachineInstr *Instr = worklist.pop_back_val();
+ unsigned Reg = Instr->getOperand(0).getReg();
+ for (const auto &Use : MRI->use_operands(Reg)) {
+ const MachineInstr *UseMI = Use.getParent();
+ AllAGPRUses &= (UseMI->isCopy() &&
+ TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
+ TRI->isAGPR(*MRI, Use.getReg());
+ if (UseMI->isCopy() || UseMI->isRegSequence()) {
+ if (UseMI->isCopy() &&
+ UseMI->getOperand(0).getReg().isPhysical() &&
+ !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
+ numVGPRUses++;
+ }
+ if (Visited.insert(UseMI).second)
+ worklist.insert(UseMI);
+
+ continue;
+ }
+
+ if (UseMI->isPHI()) {
+ const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
+ if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
+ UseRC != &AMDGPU::VReg_1RegClass)
+ numVGPRUses++;
+ continue;
+ }
+
+ const TargetRegisterClass *OpRC =
+ TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
+ if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
+ OpRC != &AMDGPU::VS_64RegClass) {
+ numVGPRUses++;
+ }
+ }
+ }
+
+ Register PHIRes = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
+ if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
+ LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
+ MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
+ }
+
+ bool hasVGPRInput = false;
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+ unsigned InputReg = MI.getOperand(i).getReg();
+ MachineInstr *Def = MRI->getVRegDef(InputReg);
+ if (TRI->isVectorRegister(*MRI, InputReg)) {
+ if (Def->isCopy()) {
+ unsigned SrcReg = Def->getOperand(1).getReg();
+ const TargetRegisterClass *RC =
+ TRI->getRegClassForReg(*MRI, SrcReg);
+ if (TRI->isSGPRClass(RC))
+ continue;
+ }
+ hasVGPRInput = true;
+ break;
+ }
+ else if (Def->isCopy() &&
+ TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
+ hasVGPRInput = true;
+ break;
+ }
+ }
+
+ if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
+ RC0 != &AMDGPU::VReg_1RegClass) &&
+ (hasVGPRInput || numVGPRUses > 1)) {
+ LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
+ TII->moveToVALU(MI);
+ }
+ else {
+ LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
+ TII->legalizeOperands(MI, MDT);
+ }
+
+}
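Of the SIFixSGPRCopies changes above, the V_WRITELANE_B32 handling is the least obvious: when the constant bus allows a single read, two non-M0 SGPR sources are illegal, so the pass first tries to fold an inline-constant definition into one source and otherwise routes src1 through M0. A condensed model of just that decision, using made-up types rather than MachineOperand:

    // Illustrative model of the 1-SGPR constant-bus repair; not LLVM API.
    struct Src { bool IsSGPR; bool IsM0; bool HasInlineConstDef; };

    enum class Fix { None, FoldImmediate, CopyThroughM0 };

    Fix repairWritelane(const Src &Src0, const Src &Src1) {
      bool BothUseBus = (Src0.IsSGPR && !Src0.IsM0) && (Src1.IsSGPR && !Src1.IsM0);
      if (!BothUseBus)
        return Fix::None;            // already abides by the 1 SGPR rule
      if (Src0.HasInlineConstDef || Src1.HasInlineConstDef)
        return Fix::FoldImmediate;   // constant-prop into one of the operands
      return Fix::CopyThroughM0;     // otherwise move src1 into m0 first
    }

    int main() {
      Src A{true, false, false}, B{true, false, true};
      return repairWritelane(A, B) == Fix::FoldImmediate ? 0 : 1;
    }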
diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
index 5b834c8de13a..a0119297b112 100644
--- a/lib/Target/AMDGPU/SIFixupVectorISel.cpp
+++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -91,8 +91,7 @@ static bool findSRegBaseAndIndex(MachineOperand *Op,
Worklist.push_back(Op);
while (!Worklist.empty()) {
MachineOperand *WOp = Worklist.pop_back_val();
- if (!WOp->isReg() ||
- !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+ if (!WOp->isReg() || !Register::isVirtualRegister(WOp->getReg()))
continue;
MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
switch (DefInst->getOpcode()) {
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 74d77d328019..4eac03168760 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -142,16 +142,20 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
switch (Opc) {
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
- case AMDGPU::V_FMAC_F32_e64: {
+ case AMDGPU::V_FMAC_F32_e64:
+ case AMDGPU::V_FMAC_F16_e64: {
// Special case for mac. Since this is replaced with mad when folded into
// src2, we need to check the legality for the final instruction.
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (static_cast<int>(OpNo) == Src2Idx) {
- bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
- bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64;
unsigned Opc = IsFMA ?
- AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+ (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
const MCInstrDesc &MadDesc = TII->get(Opc);
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
@@ -235,9 +239,11 @@ static bool updateOperand(FoldCandidate &Fold,
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();
- auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
- if (Liveness != MachineBasicBlock::LQR_Dead)
+ auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16);
+ if (Liveness != MachineBasicBlock::LQR_Dead) {
+ LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
return false;
+ }
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
int Op32 = Fold.getShrinkOpcode();
@@ -248,7 +254,7 @@ static bool updateOperand(FoldCandidate &Fold,
bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
- unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
+ Register NewReg0 = MRI.createVirtualRegister(Dst0RC);
MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
@@ -314,12 +320,15 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64) &&
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
- bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
- bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64;
unsigned NewOpc = IsFMA ?
- AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+ (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
@@ -435,7 +444,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
return false;
- if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
+ if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
+ TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
return true;
}
@@ -443,8 +453,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
if (!OpToFold.isReg())
return false;
- unsigned UseReg = OpToFold.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(UseReg))
+ Register UseReg = OpToFold.getReg();
+ if (!Register::isVirtualRegister(UseReg))
return false;
if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
@@ -481,6 +491,9 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
return false; // Can only fold splat constants
}
+ if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
+ return false;
+
FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
return true;
}
@@ -518,7 +531,7 @@ void SIFoldOperands::foldOperand(
// REG_SEQUENCE instructions, so we have to fold them into the
// uses of REG_SEQUENCE.
if (UseMI->isRegSequence()) {
- unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
+ Register RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
MachineRegisterInfo::use_iterator Next;
@@ -569,15 +582,18 @@ void SIFoldOperands::foldOperand(
OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImmLike && UseMI->isCopy()) {
- unsigned DestReg = UseMI->getOperand(0).getReg();
- const TargetRegisterClass *DestRC
- = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI->getRegClass(DestReg) :
- TRI->getPhysRegClass(DestReg);
-
- unsigned SrcReg = UseMI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
- TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ Register DestReg = UseMI->getOperand(0).getReg();
+
+ // Don't fold into a copy to a physical register. Doing so would interfere
+ // with the register coalescer's logic which would avoid redundant
+ // initializations.
+ if (DestReg.isPhysical())
+ return;
+
+ const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg);
+
+ Register SrcReg = UseMI->getOperand(1).getReg();
+ if (SrcReg.isVirtual()) { // XXX - This can be an assert?
const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
MachineRegisterInfo::use_iterator NextUse;
@@ -613,10 +629,17 @@ void SIFoldOperands::foldOperand(
return;
UseMI->setDesc(TII->get(MovOp));
+ MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
+ MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
+ while (ImpOpI != ImpOpE) {
+ MachineInstr::mop_iterator Tmp = ImpOpI;
+ ImpOpI++;
+ UseMI->RemoveOperand(UseMI->getOperandNo(Tmp));
+ }
CopiesToReplace.push_back(UseMI);
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
- TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
+ Register::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
!UseMI->getOperand(1).getSubReg()) {
@@ -677,6 +700,9 @@ void SIFoldOperands::foldOperand(
// =>
// %sgpr1 = COPY %sgpr0
UseMI->setDesc(TII->get(AMDGPU::COPY));
+ UseMI->getOperand(1).setReg(OpToFold.getReg());
+ UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+ UseMI->getOperand(1).setIsKill(false);
UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
return;
}
@@ -708,7 +734,7 @@ void SIFoldOperands::foldOperand(
// Split 64-bit constants into 32-bits for folding.
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
- unsigned UseReg = UseOp.getReg();
+ Register UseReg = UseOp.getReg();
const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
@@ -810,7 +836,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
if (Op.isReg()) {
// If this has a subregister, it obviously is a register source.
if (Op.getSubReg() != AMDGPU::NoSubRegister ||
- !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ !Register::isVirtualRegister(Op.getReg()))
return &Op;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
@@ -1073,6 +1099,13 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
Copy->addImplicitDefUseOperands(*MF);
for (FoldCandidate &Fold : FoldList) {
+ if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) {
+ Register Reg = Fold.OpToFold->getReg();
+ MachineInstr *DefMI = Fold.OpToFold->getParent();
+ if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
+ execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
+ continue;
+ }
if (updateOperand(Fold, *TII, *TRI, *ST)) {
// Clear kill flags.
if (Fold.isReg()) {
@@ -1316,6 +1349,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock *MBB : depth_first(&MF)) {
MachineBasicBlock::iterator I, Next;
+
+ MachineOperand *CurrentKnownM0Val = nullptr;
for (I = MBB->begin(); I != MBB->end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
@@ -1328,6 +1363,25 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
!tryFoldOMod(MI))
tryFoldClamp(MI);
+
+ // Saw an unknown clobber of m0, so we no longer know what it is.
+ if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
+ CurrentKnownM0Val = nullptr;
+ continue;
+ }
+
+ // Specially track simple redefs of m0 to the same value in a block, so we
+ // can erase the later ones.
+ if (MI.getOperand(0).getReg() == AMDGPU::M0) {
+ MachineOperand &NewM0Val = MI.getOperand(1);
+ if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
+ MI.eraseFromParent();
+ continue;
+ }
+
+ // We aren't tracking other physical registers
+ CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ?
+ nullptr : &NewM0Val;
continue;
}
@@ -1339,8 +1393,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (!FoldingImm && !OpToFold.isReg())
continue;
- if (OpToFold.isReg() &&
- !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
+ if (OpToFold.isReg() && !Register::isVirtualRegister(OpToFold.getReg()))
continue;
// Prevent folding operands backwards in the function. For example,
@@ -1350,8 +1403,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// ...
// %vgpr0 = V_MOV_B32_e32 1, implicit %exec
MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() &&
- !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+ if (Dst.isReg() && !Register::isVirtualRegister(Dst.getReg()))
continue;
foldInstOperand(MI, OpToFold);
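The last SIFoldOperands hunk adds simple per-block tracking of m0: the pass remembers the last value copied into m0, erases a later copy of the identical value, and forgets what it knew on any other clobber. A compact model of that bookkeeping, with illustrative types only:

    #include <optional>
    #include <vector>

    // Not LLVM types; just enough structure to show the bookkeeping.
    struct Inst { bool CopiesToM0; bool ClobbersM0; int Value; bool Erased = false; };

    void eraseRedundantM0Copies(std::vector<Inst> &Block) {
      std::optional<int> KnownM0;            // last value known to be in m0
      for (Inst &I : Block) {
        if (I.CopiesToM0) {
          if (KnownM0 && *KnownM0 == I.Value) { I.Erased = true; continue; }
          KnownM0 = I.Value;                 // remember the new value
        } else if (I.ClobbersM0) {
          KnownM0.reset();                   // unknown clobber: stop trusting it
        }
      }
    }

    int main() {
      std::vector<Inst> B = {{true, false, 7, false}, {false, false, 0, false},
                             {true, false, 7, false}};
      eraseRedundantM0Copies(B);
      return B[2].Erased ? 0 : 1;            // the second identical copy goes away
    }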
diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index f3c9ad63a80a..26bae5734df7 100644
--- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -120,7 +120,7 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
return false;
// If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it.
for (const MachineOperand &ResMO : MI.defs()) {
- unsigned ResReg = ResMO.getReg();
+ Register ResReg = ResMO.getReg();
for (const MachineOperand &MO : MI.uses()) {
if (!MO.isReg() || MO.isDef())
continue;
@@ -144,7 +144,7 @@ static unsigned getMopState(const MachineOperand &MO) {
S |= RegState::Kill;
if (MO.isEarlyClobber())
S |= RegState::EarlyClobber;
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable())
+ if (Register::isPhysicalRegister(MO.getReg()) && MO.isRenamable())
S |= RegState::Renamable;
return S;
}
@@ -152,7 +152,7 @@ static unsigned getMopState(const MachineOperand &MO) {
template <typename Callable>
void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask,
Callable Func) const {
- if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ if (LaneMask.all() || Register::isPhysicalRegister(Reg) ||
LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
Func(0);
return;
@@ -216,7 +216,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// If it is tied we will need to write same register as we read.
if (MO.isTied())
@@ -227,7 +227,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
if (Conflict == Map.end())
continue;
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return false;
LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
@@ -265,13 +265,13 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ?
- TRI->getSubRegIndexLaneMask(MO.getSubReg()) :
- LaneBitmask::getAll();
+ LaneBitmask Mask = Register::isVirtualRegister(Reg)
+ ? TRI->getSubRegIndexLaneMask(MO.getSubReg())
+ : LaneBitmask::getAll();
RegUse &Map = MO.isDef() ? Defs : Uses;
auto Loc = Map.find(Reg);
@@ -389,7 +389,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
for (auto &&R : Defs) {
unsigned Reg = R.first;
Uses.erase(Reg);
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
continue;
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
@@ -397,7 +397,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
for (auto &&R : Uses) {
unsigned Reg = R.first;
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
continue;
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index feab6bed2603..ed07ed100a19 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -112,6 +112,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(MMO);
return;
}
@@ -132,6 +133,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(MMO);
}
@@ -157,6 +159,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(MMO);
return;
}
@@ -177,6 +180,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(MMO);
}
@@ -202,15 +206,15 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
DebugLoc DL;
MachineBasicBlock::iterator I = MBB.begin();
- unsigned FlatScratchInitReg
- = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
+ Register FlatScratchInitReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);
- unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
- unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
@@ -424,8 +428,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
// We need to insert initialization of the scratch resource descriptor.
- unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdHsaOrMesa(F)) {
@@ -539,9 +543,9 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
if (ST.isAmdPalOS()) {
// The pointer to the GIT is formed from the offset passed in and either
// the amdgpu-git-ptr-high function attribute or the top part of the PC
- unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
- unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+ Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -601,14 +605,14 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
- unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
- unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+ Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
if (MFI->hasImplicitBufferPtr()) {
- unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+ Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
@@ -640,8 +644,8 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
}
} else {
- unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
BuildMI(MBB, I, DL, SMovB32, Rsrc0)
.addExternalSymbol("SCRATCH_RSRC_DWORD0")
@@ -669,6 +673,8 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
case TargetStackID::NoAlloc:
case TargetStackID::SGPRSpill:
return true;
+ case TargetStackID::SVEVector:
+ return false;
}
llvm_unreachable("Invalid TargetStackID::Value");
}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index c644f4726e2c..d9970fd6b4b8 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -20,9 +20,9 @@ class GCNSubtarget;
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
- SIFrameLowering(StackDirection D, unsigned StackAl, int LAO,
- unsigned TransAl = 1) :
- AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ SIFrameLowering(StackDirection D, Align StackAl, int LAO,
+ Align TransAl = Align::None())
+ : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~SIFrameLowering() override = default;
void emitEntryFunctionPrologue(MachineFunction &MF,
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index db0782e2bf3e..56ebf9c06741 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -20,11 +20,11 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -35,6 +35,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
@@ -44,6 +45,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -115,7 +117,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
@@ -125,10 +127,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
- addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
- addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
+ addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
@@ -141,12 +143,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
if (Subtarget->has16BitInsts()) {
- addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
- addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
// Unless there are also VOP3P operations, no operations are really legal.
- addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
- addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
@@ -178,6 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v32i32, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
@@ -215,31 +218,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
-
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
-
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
@@ -653,6 +635,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FADD, MVT::v4f16, Custom);
setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
@@ -687,6 +670,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
}
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
+
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::ADDCARRY);
setTargetDAGCombine(ISD::SUB);
@@ -768,19 +778,22 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // TODO: Consider splitting all arguments into 32-bit pieces.
- if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ if (CC == CallingConv::AMDGPU_KERNEL)
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+
+ if (VT.isVector()) {
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
if (Size == 32)
return ScalarVT.getSimpleVT();
- if (Size == 64)
+ if (Size > 32)
return MVT::i32;
if (Size == 16 && Subtarget->has16BitInsts())
return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
- }
+ } else if (VT.getSizeInBits() > 32)
+ return MVT::i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -788,7 +801,10 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ if (CC == CallingConv::AMDGPU_KERNEL)
+ return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+
+ if (VT.isVector()) {
unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
@@ -796,12 +812,13 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
if (Size == 32)
return NumElts;
- if (Size == 64)
- return 2 * NumElts;
+ if (Size > 32)
+ return NumElts * ((Size + 31) / 32);
if (Size == 16 && Subtarget->has16BitInsts())
- return (VT.getVectorNumElements() + 1) / 2;
- }
+ return (NumElts + 1) / 2;
+ } else if (VT.getSizeInBits() > 32)
+ return (VT.getSizeInBits() + 31) / 32;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
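Both hunks above generalize the non-kernel calling-convention splitting from the 64-bit special case to any scalar wider than 32 bits, using ceil(Size/32) i32 pieces. A standalone restatement of the count arithmetic (an assumed simplification: it ignores the AMDGPU_KERNEL path and takes the 16-bit packing branch unconditionally):

// Standalone model of the register-count math above.
#include <cassert>
#include <cstdio>

// Number of 32-bit registers needed for NumElts elements of EltBits each.
static unsigned numRegsForCC(unsigned NumElts, unsigned EltBits) {
  if (EltBits == 32)
    return NumElts;
  if (EltBits > 32)
    return NumElts * ((EltBits + 31) / 32); // each element split into i32s
  return (NumElts + 1) / 2;                 // 16-bit elements pack in pairs
}

int main() {
  assert(numRegsForCC(2, 64) == 4);  // v2i64 -> 4 x i32
  assert(numRegsForCC(1, 128) == 4); // i128  -> 4 x i32 (new: any Size > 32)
  assert(numRegsForCC(3, 16) == 2);  // v3i16 -> 2 registers
  std::printf("ok\n");
  return 0;
}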
@@ -821,10 +838,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
return NumIntermediates;
}
- if (Size == 64) {
+ if (Size > 32) {
RegisterVT = MVT::i32;
IntermediateVT = RegisterVT;
- NumIntermediates = 2 * NumElts;
+ NumIntermediates = NumElts * ((Size + 31) / 32);
return NumIntermediates;
}
@@ -901,7 +918,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = MFI->getImagePSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
CI.getArgOperand(RsrcIntr->RsrcArg));
- Info.align = 0;
+ Info.align.reset();
} else {
Info.ptrVal = MFI->getBufferPSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
@@ -947,7 +964,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
- Info.align = 0;
+ Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
@@ -964,7 +981,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = MFI->getBufferPSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
CI.getArgOperand(1));
- Info.align = 0;
+ Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
@@ -978,7 +995,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
->getPointerElementType());
Info.ptrVal = CI.getOperand(0);
- Info.align = 0;
+ Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
@@ -988,7 +1005,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
- Info.align = 0;
+ Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
@@ -1012,7 +1029,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// This is an abstract access, but we need to specify a type and size.
Info.memVT = MVT::i32;
Info.size = 4;
- Info.align = 4;
+ Info.align = Align(4);
Info.flags = MachineMemOperand::MOStore;
if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
@@ -1215,21 +1232,12 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
return true;
}
-bool SITargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
- bool *IsFast) const {
+bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
+ unsigned Size, unsigned AddrSpace, unsigned Align,
+ MachineMemOperand::Flags Flags, bool *IsFast) const {
if (IsFast)
*IsFast = false;
- // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
- // which isn't a simple VT.
- // Until MVT is extended to handle this, simply check for the size and
- // rely on the condition below: allow accesses if the size is a multiple of 4.
- if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
- VT.getStoreSize() > 16)) {
- return false;
- }
-
if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
@@ -1268,7 +1276,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
}
// Smaller than dword value must be aligned.
- if (VT.bitsLT(MVT::i32))
+ if (Size < 32)
return false;
// 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
@@ -1277,7 +1285,26 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
if (IsFast)
*IsFast = true;
- return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+ return Size >= 32 && Align >= 4;
+}
+
+bool SITargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *IsFast) const {
+ if (IsFast)
+ *IsFast = false;
+
+ // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
+ // which isn't a simple VT.
+ // Until MVT is extended to handle this, simply check for the size and
+ // rely on the condition below: allow accesses if the size is a multiple of 4.
+ if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
+ VT.getStoreSize() > 16)) {
+ return false;
+ }
+
+ return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
+ Align, Flags, IsFast);
}
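The refactor above moves the size/alignment decision into allowsMisalignedMemoryAccessesImpl so it can be queried with a raw bit size. For the final fall-through the rule becomes "dword or larger, at least 4-byte aligned"; a minimal standalone restatement (ignoring the LDS/region special cases handled earlier in the function):

#include <cassert>

// Fall-through rule above: sub-dword values must be naturally aligned; dword
// or larger values are allowed (and fast) when byte alignment is at least 4.
static bool misalignedOkFallthrough(unsigned SizeInBits, unsigned AlignBytes) {
  if (SizeInBits < 32)
    return false;
  return AlignBytes >= 4; // mirrors "Size >= 32 && Align >= 4"
}

int main() {
  assert(!misalignedOkFallthrough(16, 4));
  assert(misalignedOkFallthrough(32, 4)); // i32 now qualifies; the old check required > 32 bits
  assert(!misalignedOkFallthrough(64, 2));
  return 0;
}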
EVT SITargetLowering::getOptimalMemOpType(
@@ -1336,9 +1363,9 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
- if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
- return TypeSplitVector;
-
+ int NumElts = VT.getVectorNumElements();
+ if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
+ return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
@@ -1562,7 +1589,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
// entire split argument.
if (Arg->Flags.isSplit()) {
while (!Arg->Flags.isSplitEnd()) {
- assert(!Arg->VT.isVector() &&
+ assert((!Arg->VT.isVector() ||
+ Arg->VT.getScalarSizeInBits() == 16) &&
"unexpected vector split in ps argument type");
if (!SkipArg)
Splits.push_back(*Arg);
@@ -1589,29 +1617,32 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
}
// Allocate special inputs passed in VGPRs.
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
+ const LLT S32 = LLT::scalar(32);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
if (Info.hasWorkItemIDX()) {
- unsigned Reg = AMDGPU::VGPR0;
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ Register Reg = AMDGPU::VGPR0;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDY()) {
- unsigned Reg = AMDGPU::VGPR1;
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ Register Reg = AMDGPU::VGPR1;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDZ()) {
- unsigned Reg = AMDGPU::VGPR2;
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ Register Reg = AMDGPU::VGPR2;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
@@ -1642,7 +1673,8 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
assert(Reg != AMDGPU::NoRegister);
MachineFunction &MF = CCInfo.getMachineFunction();
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
return ArgDescriptor::createRegister(Reg, Mask);
}
@@ -1671,10 +1703,10 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
-static void allocateSpecialInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
const unsigned Mask = 0x3ff;
ArgDescriptor Arg;
@@ -1692,10 +1724,11 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
-static void allocateSpecialInputSGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateSpecialInputSGPRs(
+ CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
auto &ArgInfo = Info.getArgInfo();
// TODO: Unify handling with private memory pointers.
@@ -1728,10 +1761,10 @@ static void allocateSpecialInputSGPRs(CCState &CCInfo,
}
// Allocate special inputs passed in user SGPRs.
-static void allocateHSAUserSGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
if (Info.hasImplicitBufferPtr()) {
unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
@@ -1758,9 +1791,12 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
}
if (Info.hasKernargSegmentPtr()) {
- unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
- MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
CCInfo.AllocateReg(InputPtrReg);
+
+ Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
+ MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
}
if (Info.hasDispatchID()) {
@@ -1780,32 +1816,32 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
}
// Allocate special input registers that are initialized per-wave.
-static void allocateSystemSGPRs(CCState &CCInfo,
- MachineFunction &MF,
- SIMachineFunctionInfo &Info,
- CallingConv::ID CallConv,
- bool IsShader) {
+void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ SIMachineFunctionInfo &Info,
+ CallingConv::ID CallConv,
+ bool IsShader) const {
if (Info.hasWorkGroupIDX()) {
unsigned Reg = Info.addWorkGroupIDX();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDY()) {
unsigned Reg = Info.addWorkGroupIDY();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDZ()) {
unsigned Reg = Info.addWorkGroupIDZ();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupInfo()) {
unsigned Reg = Info.addWorkGroupInfo();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
@@ -1860,7 +1896,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
- unsigned PrivateSegmentBufferReg =
+ Register PrivateSegmentBufferReg =
Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
} else {
@@ -1921,7 +1957,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
//
// FIXME: Should not do this if inline asm is reading/writing these
// registers.
- unsigned PreloadedSP = Info.getPreloadedReg(
+ Register PreloadedSP = Info.getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setStackPtrOffsetReg(PreloadedSP);
@@ -1971,7 +2007,7 @@ void SITargetLowering::insertCopiesSplitCSR(
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
@@ -2134,7 +2170,7 @@ SDValue SITargetLowering::LowerFormalArguments(
assert(VA.isRegLoc() && "Parameter must be in a register!");
- unsigned Reg = VA.getLocReg();
+ Register Reg = VA.getLocReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
EVT ValVT = VA.getValVT();
@@ -2652,6 +2688,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsThisReturn = false;
MachineFunction &MF = DAG.getMachineFunction();
+ if (Callee.isUndef() || isNullConstant(Callee)) {
+ if (!CLI.IsTailCall) {
+ for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
+ InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+ }
+
+ return Chain;
+ }
+
if (IsVarArg) {
return lowerUnhandledCall(CLI, InVals,
"unsupported call to variadic function ");
@@ -2782,7 +2827,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
int32_t Offset = LocMemOffset;
SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
- unsigned Align = 0;
+ MaybeAlign Alignment;
if (IsTailCall) {
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
@@ -2790,8 +2835,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Flags.getByValSize() : VA.getValVT().getStoreSize();
// FIXME: We can have better than the minimum byval required alignment.
- Align = Flags.isByVal() ? Flags.getByValAlign() :
- MinAlign(Subtarget->getStackAlignment(), Offset);
+ Alignment =
+ Flags.isByVal()
+ ? MaybeAlign(Flags.getByValAlign())
+ : commonAlignment(Subtarget->getStackAlignment(), Offset);
Offset = Offset + FPDiff;
int FI = MFI.CreateFixedObject(OpSize, Offset, true);
@@ -2810,7 +2857,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
} else {
DstAddr = PtrOff;
DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
- Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
+ Alignment =
+ commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
@@ -2825,7 +2873,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MemOpChains.push_back(Cpy);
} else {
- SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo,
+ Alignment ? Alignment->value() : 0);
MemOpChains.push_back(Store);
}
}
@@ -2937,9 +2986,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
IsThisReturn ? OutVals[0] : SDValue());
}
-unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
- unsigned Reg = StringSwitch<unsigned>(RegName)
+Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const {
+ Register Reg = StringSwitch<Register>(RegName)
.Case("m0", AMDGPU::M0)
.Case("exec", AMDGPU::EXEC)
.Case("exec_lo", AMDGPU::EXEC_LO)
@@ -2947,7 +2996,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
.Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
- .Default(AMDGPU::NoRegister);
+ .Default(Register());
if (Reg == AMDGPU::NoRegister) {
report_fatal_error(Twine("invalid register name \""
@@ -3055,6 +3104,20 @@ splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
return std::make_pair(LoopBB, RemainderBB);
}
+/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
+void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ auto I = MI.getIterator();
+ auto E = std::next(I);
+
+ BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ MIBundleBuilder Bundler(*MBB, I, E);
+ finalizeBundle(*MBB, Bundler.begin());
+}
+
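bundleInstWithWaitcnt implements the requirement noted further down for GWS instructions: an s_waitcnt 0 must be the very next instruction, and bundling keeps later passes from scheduling anything in between. A string-level sketch of the shape of the transformation (hypothetical mnemonics only, not real MachineInstr bundles):

#include <cassert>
#include <string>
#include <vector>

// Fuse the instruction at Pos with an immediately-following "s_waitcnt 0",
// modelling the bundle as a single element nothing can be scheduled into.
static void bundleWithWaitcnt(std::vector<std::string> &Block, size_t Pos) {
  Block[Pos] = "BUNDLE { " + Block[Pos] + "; s_waitcnt 0 }";
}

int main() {
  std::vector<std::string> Block = {"s_mov_b32 m0, s4",
                                    "ds_gws_barrier v0 offset:0 gds"};
  bundleWithWaitcnt(Block, 1);
  assert(Block[1] == "BUNDLE { ds_gws_barrier v0 offset:0 gds; s_waitcnt 0 }");
  return 0;
}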
MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -3066,12 +3129,13 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
MachineBasicBlock *RemainderBB;
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
+ // Apparently kill flags are only valid if the def is in the same block?
+ if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
+ Src->setIsKill(false);
std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
MachineBasicBlock::iterator I = LoopBB->end();
- MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
@@ -3081,23 +3145,9 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
.addImm(0)
.addImm(EncodedReg);
- // This is a pain, but we're not allowed to have physical register live-ins
- // yet. Insert a pair of copies if the VGPR0 hack is necessary.
- if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
- unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
- .add(*Src);
+ bundleInstWithWaitcnt(MI);
- BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
- .addReg(Data0);
-
- MRI.setSimpleHint(Data0, Src->getReg());
- }
-
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
-
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// Load and check TRAP_STS.MEM_VIOL
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
@@ -3138,10 +3188,10 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
MachineBasicBlock::iterator I = LoopBB.begin();
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
- unsigned PhiExec = MRI.createVirtualRegister(BoolRC);
- unsigned NewExec = MRI.createVirtualRegister(BoolRC);
- unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned CondReg = MRI.createVirtualRegister(BoolRC);
+ Register PhiExec = MRI.createVirtualRegister(BoolRC);
+ Register NewExec = MRI.createVirtualRegister(BoolRC);
+ Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CondReg = MRI.createVirtualRegister(BoolRC);
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
.addReg(InitReg)
@@ -3240,9 +3290,9 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
MachineBasicBlock::iterator I(&MI);
const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
- unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC);
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -3315,7 +3365,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
SetOn->getOperand(3).setIsUndef();
} else {
- unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
.add(*Idx)
.addImm(Offset);
@@ -3351,8 +3401,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
@@ -3390,8 +3440,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
- unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
@@ -3442,7 +3492,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Dst = MI.getOperand(0).getReg();
+ Register Dst = MI.getOperand(0).getReg();
const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
@@ -3505,7 +3555,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
const DebugLoc &DL = MI.getDebugLoc();
- unsigned PhiReg = MRI.createVirtualRegister(VecRC);
+ Register PhiReg = MRI.createVirtualRegister(VecRC);
auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
Offset, UseGPRIdxMode, false);
@@ -3564,22 +3614,22 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
- unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
Src0, BoolRC, AMDGPU::sub0,
- &AMDGPU::SReg_32_XM0RegClass);
+ &AMDGPU::SReg_32RegClass);
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
Src0, BoolRC, AMDGPU::sub1,
- &AMDGPU::SReg_32_XM0RegClass);
+ &AMDGPU::SReg_32RegClass);
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
Src1, BoolRC, AMDGPU::sub0,
- &AMDGPU::SReg_32_XM0RegClass);
+ &AMDGPU::SReg_32RegClass);
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
Src1, BoolRC, AMDGPU::sub1,
- &AMDGPU::SReg_32_XM0RegClass);
+ &AMDGPU::SReg_32RegClass);
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
@@ -3632,8 +3682,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
// S_CMOV_B64 exec, -1
MachineInstr *FirstMI = &*BB->begin();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned InputReg = MI.getOperand(0).getReg();
- unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register InputReg = MI.getOperand(0).getReg();
+ Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
bool Found = false;
// Move the COPY of the input reg to the beginning, so that we can use it.
@@ -3707,16 +3757,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src0 = MI.getOperand(1).getReg();
- unsigned Src1 = MI.getOperand(2).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
const DebugLoc &DL = MI.getDebugLoc();
- unsigned SrcCond = MI.getOperand(3).getReg();
+ Register SrcCond = MI.getOperand(3).getReg();
- unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC);
+ Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
.addReg(SrcCond);
@@ -3814,8 +3864,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
case AMDGPU::DS_GWS_BARRIER:
- if (getSubtarget()->hasGWSAutoReplay())
+ // An s_waitcnt 0 is required to be the instruction immediately following.
+ if (getSubtarget()->hasGWSAutoReplay()) {
+ bundleInstWithWaitcnt(MI);
return BB;
+ }
+
return emitGWSMemViolTestLoop(MI, BB);
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
@@ -3939,6 +3993,30 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
+SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+ SDValue Lo0, Hi0;
+ std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ SDValue Lo1, Hi1;
+ std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+ SDValue Lo2, Hi2;
+ std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
+
+ SDLoc SL(Op);
+
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
+
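splitTernaryVectorOp mirrors splitBinaryVectorOp for three-operand nodes such as the v4f16 FMA made Custom above: each operand is split into low and high halves, the operation is emitted per half, and the halves are concatenated. A scalar-array model of that structure (plain float in place of the v2f16 halves):

#include <array>
#include <cassert>
#include <cmath>

// Split a 4-element FMA into two 2-element halves and concatenate the results,
// mimicking the Lo/Hi split plus CONCAT_VECTORS shape above.
static std::array<float, 4> splitFma4(const std::array<float, 4> &A,
                                      const std::array<float, 4> &B,
                                      const std::array<float, 4> &C) {
  std::array<float, 4> R{};
  for (int Half = 0; Half < 2; ++Half)   // Lo half, then Hi half
    for (int I = 0; I < 2; ++I) {
      int Idx = Half * 2 + I;
      R[Idx] = std::fma(A[Idx], B[Idx], C[Idx]);
    }
  return R;
}

int main() {
  auto R = splitFma4({1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12});
  assert(R[0] == 14.0f && R[3] == 44.0f); // 1*5+9 and 4*8+12
  return 0;
}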
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -3991,6 +4069,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM:
case ISD::FMAXNUM:
return lowerFMINNUM_FMAXNUM(Op, DAG);
+ case ISD::FMA:
+ return splitTernaryVectorOp(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -4070,6 +4150,41 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
}
+SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
+ SelectionDAG &DAG,
+ ArrayRef<SDValue> Ops) const {
+ SDLoc DL(M);
+ EVT LoadVT = M->getValueType(0);
+ EVT EltType = LoadVT.getScalarType();
+ EVT IntVT = LoadVT.changeTypeToInteger();
+
+ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
+
+ unsigned Opc =
+ IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD;
+
+ if (IsD16) {
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
+ }
+
+ // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+ if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+ if (isTypeLegal(LoadVT)) {
+ return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
+ M->getMemOperand(), DAG);
+ }
+
+ EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
+ SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
+ SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
+ M->getMemOperand(), DAG);
+ return DAG.getMergeValues(
+ {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
+ DL);
+}
+
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -4196,8 +4311,14 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_W_CHAIN: {
if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
- Results.push_back(Res);
- Results.push_back(Res.getValue(1));
+ if (Res.getOpcode() == ISD::MERGE_VALUES) {
+ // FIXME: Hacky
+ Results.push_back(Res.getOperand(0));
+ Results.push_back(Res.getOperand(1));
+ } else {
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ }
return;
}
@@ -4935,11 +5056,8 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
// compute the correct address.
- unsigned LoFlags = GAFlags;
- if (LoFlags == SIInstrInfo::MO_NONE)
- LoFlags = SIInstrInfo::MO_REL32;
SDValue PtrLo =
- DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags);
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
SDValue PtrHi;
if (GAFlags == SIInstrInfo::MO_NONE) {
PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
@@ -5563,14 +5681,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
SDValue Ops[] = {
- DAG.getEntryNode(), // Chain
- Rsrc, // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- {}, // voffset
- {}, // soffset
- {}, // offset
- DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
// Use the alignment to ensure that the required offsets will fit into the
@@ -5579,7 +5697,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
for (unsigned i = 0; i < NumLoads; ++i) {
- Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
+ Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
Ops, LoadVT, MMO));
}
@@ -5758,45 +5876,31 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
- case Intrinsic::amdgcn_interp_mov: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
- SDValue Glue = M0.getValue(1);
- return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3), Glue);
- }
- case Intrinsic::amdgcn_interp_p1: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
- SDValue Glue = M0.getValue(1);
- return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3), Glue);
- }
- case Intrinsic::amdgcn_interp_p2: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
- SDValue Glue = SDValue(M0.getNode(), 1);
- return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
- Glue);
- }
case Intrinsic::amdgcn_interp_p1_f16: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
- SDValue Glue = M0.getValue(1);
+ SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
+ Op.getOperand(5), SDValue());
if (getSubtarget()->getLDSBankCount() == 16) {
// 16 bank LDS
- SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
- DAG.getConstant(2, DL, MVT::i32), // P0
- Op.getOperand(2), // Attrchan
- Op.getOperand(3), // Attr
- Glue);
+
+ // FIXME: This will implicitly insert a second CopyToReg to M0.
+ SDValue S = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32),
+ DAG.getConstant(2, DL, MVT::i32), // P0
+ Op.getOperand(2), // Attrchan
+ Op.getOperand(3), // Attr
+ Op.getOperand(5)); // m0
+
SDValue Ops[] = {
Op.getOperand(1), // Src0
Op.getOperand(2), // Attrchan
Op.getOperand(3), // Attr
- DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
S, // Src2 - holds two f16 values selected by high
- DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
Op.getOperand(4), // high
- DAG.getConstant(0, DL, MVT::i1), // $clamp
- DAG.getConstant(0, DL, MVT::i32) // $omod
+ DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
+ DAG.getTargetConstant(0, DL, MVT::i32) // $omod
};
return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
} else {
@@ -5805,28 +5909,28 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), // Src0
Op.getOperand(2), // Attrchan
Op.getOperand(3), // Attr
- DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
Op.getOperand(4), // high
- DAG.getConstant(0, DL, MVT::i1), // $clamp
- DAG.getConstant(0, DL, MVT::i32), // $omod
- Glue
+ DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
+ DAG.getTargetConstant(0, DL, MVT::i32), // $omod
+ ToM0.getValue(1)
};
return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
}
}
case Intrinsic::amdgcn_interp_p2_f16: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
- SDValue Glue = SDValue(M0.getNode(), 1);
+ SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
+ Op.getOperand(6), SDValue());
SDValue Ops[] = {
Op.getOperand(2), // Src0
Op.getOperand(3), // Attrchan
Op.getOperand(4), // Attr
- DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
Op.getOperand(1), // Src2
- DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
Op.getOperand(5), // high
- DAG.getConstant(0, DL, MVT::i1), // $clamp
- Glue
+ DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
+ ToM0.getValue(1)
};
return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
}
@@ -5947,16 +6051,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
- case Intrinsic::amdgcn_wqm: {
- SDValue Src = Op.getOperand(1);
- return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
- 0);
- }
- case Intrinsic::amdgcn_wwm: {
- SDValue Src = Op.getOperand(1);
- return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
- 0);
- }
case Intrinsic::amdgcn_fmad_ftz:
return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
@@ -5977,6 +6071,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SIInstrInfo::MO_ABS32_LO);
return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
}
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private: {
+ SDLoc SL(Op);
+ unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
+ AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
+ SDValue Aperture = getSegmentAperture(AS, SL, DAG);
+ SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
+ Op.getOperand(1));
+
+ SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
+ DAG.getConstant(1, SL, MVT::i32));
+ return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
+ }
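The amdgcn_is_shared/amdgcn_is_private lowering classifies a 64-bit flat pointer by comparing its high dword with the aperture base of the LDS or private segment. A standalone sketch of that comparison (the aperture constant here is an assumed example value, not a real target register read):

#include <cassert>
#include <cstdint>

// A flat pointer belongs to the segment whose aperture base matches the
// pointer's high dword.
static bool isInSegment(uint64_t FlatPtr, uint32_t ApertureHiDword) {
  uint32_t SrcHi = static_cast<uint32_t>(FlatPtr >> 32); // extract element 1
  return SrcHi == ApertureHiDword;                       // SETEQ
}

int main() {
  const uint32_t SharedAperture = 0x10000u; // assumed example value only
  uint64_t LdsPtr = (uint64_t(SharedAperture) << 32) | 0x1234;
  uint64_t GlobalPtr = 0x00007fffdeadbeefULL;
  assert(isInSegment(LdsPtr, SharedAperture));
  assert(!isInSegment(GlobalPtr, SharedAperture));
  return 0;
}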
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -5986,6 +6093,30 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+// This function computes an appropriate offset to pass to
+// MachineMemOperand::setOffset() based on the offset inputs to
+// an intrinsic. If any of the offsets are non-constant or
+// if VIndex is non-zero then this function returns 0. Otherwise,
+// it returns the sum of VOffset, SOffset, and Offset.
+static unsigned getBufferOffsetForMMO(SDValue VOffset,
+ SDValue SOffset,
+ SDValue Offset,
+ SDValue VIndex = SDValue()) {
+
+ if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
+ !isa<ConstantSDNode>(Offset))
+ return 0;
+
+ if (VIndex) {
+ if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue())
+ return 0;
+ }
+
+ return cast<ConstantSDNode>(VOffset)->getSExtValue() +
+ cast<ConstantSDNode>(SOffset)->getSExtValue() +
+ cast<ConstantSDNode>(Offset)->getSExtValue();
+}
+
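A plain-integer restatement of getBufferOffsetForMMO's contract, with std::optional standing in for the ConstantSDNode checks (an assumed simplification: the default VIndex of 0 models the raw-buffer case that carries no vindex operand, while std::nullopt models a non-constant input):

#include <cassert>
#include <cstdint>
#include <optional>

// Returns the known byte offset for the memory operand, or 0 when any input
// is non-constant or when the vindex is (or may be) non-zero.
static int64_t bufferOffsetForMMO(std::optional<int64_t> VOffset,
                                  std::optional<int64_t> SOffset,
                                  std::optional<int64_t> Offset,
                                  std::optional<int64_t> VIndex = 0) {
  if (!VOffset || !SOffset || !Offset || !VIndex)
    return 0;           // some input is not a compile-time constant
  if (*VIndex != 0)
    return 0;           // indexed access: the total offset is unknowable
  return *VOffset + *SOffset + *Offset;
}

int main() {
  assert(bufferOffsetForMMO(4, 8, 16) == 28);              // all constants
  assert(bufferOffsetForMMO(4, std::nullopt, 16) == 0);    // non-constant soffset
  assert(bufferOffsetForMMO(4, 8, 16, 1) == 0);            // non-zero vindex
  assert(bufferOffsetForMMO(4, 8, 16, std::nullopt) == 0); // non-constant vindex
  return 0;
}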
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -6128,17 +6259,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
+
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
EVT LoadVT = Op.getValueType();
if (LoadVT.getScalarType() == MVT::f16)
@@ -6155,6 +6291,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format: {
+ const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
+
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -6163,32 +6301,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Offsets.first, // voffset
Op.getOperand(4), // soffset
Offsets.second, // offset
- Op.getOperand(5), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(5), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
- unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 ||
- LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5]));
+ return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format: {
+ const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format;
+
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -6197,29 +6321,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Offsets.first, // voffset
Op.getOperand(5), // soffset
Offsets.second, // offset
- Op.getOperand(6), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
- unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 ||
- LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5],
+ Ops[2]));
+ return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -6239,9 +6348,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(4), // voffset
Op.getOperand(5), // soffset
Op.getOperand(6), // offset
- DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -6264,8 +6373,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(4), // soffset
Offsets.second, // offset
Op.getOperand(5), // format
- Op.getOperand(6), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -6288,8 +6397,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // soffset
Offsets.second, // offset
Op.getOperand(6), // format
- Op.getOperand(7), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -6321,13 +6430,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
unsigned Opcode = 0;
switch (IntrID) {
@@ -6377,7 +6490,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
- case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec: {
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -6388,11 +6503,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
unsigned Opcode = 0;
switch (IntrID) {
@@ -6426,6 +6542,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
+ break;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -6442,7 +6564,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
- case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec: {
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -6453,11 +6577,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(6), // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
+ Ops[3]));
unsigned Opcode = 0;
switch (IntrID) {
@@ -6491,6 +6617,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
+ break;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -6512,12 +6644,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -6534,10 +6670,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(6), // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -6554,10 +6691,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(7), // soffset
Offsets.second, // offset
Op.getOperand(8), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7],
+ Ops[4]));
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -6686,23 +6825,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
}
- case Intrinsic::amdgcn_s_sendmsg:
- case Intrinsic::amdgcn_s_sendmsghalt: {
- unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
- AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
- Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
- SDValue Glue = Chain.getValue(1);
- return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
- Op.getOperand(2), Glue);
- }
- case Intrinsic::amdgcn_init_exec: {
- return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
- Op.getOperand(2));
- }
- case Intrinsic::amdgcn_init_exec_from_input: {
- return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
- Op.getOperand(2), Op.getOperand(3));
- }
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -6733,9 +6855,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(5), // voffset
Op.getOperand(6), // soffset
Op.getOperand(7), // offset
- DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -6759,8 +6881,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(6), // soffset
Offsets.second, // offset
Op.getOperand(7), // format
- Op.getOperand(8), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idexen
+ Op.getOperand(8), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -6784,8 +6906,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(5), // soffset
Offsets.second, // offset
Op.getOperand(6), // format
- Op.getOperand(7), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idexen
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -6813,14 +6935,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -6833,10 +6959,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format: {
+ const bool IsFormat =
+ IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;
+
SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ EVT VDataVT = VData.getValueType();
+ EVT EltType = VDataVT.getScalarType();
+ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
if (IsD16)
VData = handleD16VData(VData, DAG);
+
+ if (!isTypeLegal(VDataVT)) {
+ VData =
+ DAG.getNode(ISD::BITCAST, DL,
+ getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
+ }
+
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
Chain,
@@ -6846,18 +6984,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Offsets.first, // voffset
Op.getOperand(5), // soffset
Offsets.second, // offset
- Op.getOperand(6), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
- unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
- AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ unsigned Opc =
+ IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
- EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
- return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+ if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
+ return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
@@ -6865,10 +7003,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store_format: {
+ const bool IsFormat =
+ IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;
+
SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ EVT VDataVT = VData.getValueType();
+ EVT EltType = VDataVT.getScalarType();
+ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
+
if (IsD16)
VData = handleD16VData(VData, DAG);
+
+ if (!isTypeLegal(VDataVT)) {
+ VData =
+ DAG.getNode(ISD::BITCAST, DL,
+ getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
+ }
+
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
Chain,
@@ -6878,17 +7029,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Offsets.first, // voffset
Op.getOperand(6), // soffset
Offsets.second, // offset
- Op.getOperand(7), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
+ Ops[3]));
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
+ if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
@@ -6908,13 +7061,17 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
EVT VT = Op.getOperand(2).getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
: AMDGPUISD::BUFFER_ATOMIC_FADD;
@@ -6987,7 +7144,7 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
Overflow += ImmOffset;
ImmOffset = 0;
}
- C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
+ C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
if (Overflow) {
auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
if (!N0)
@@ -7001,14 +7158,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
if (!N0)
N0 = DAG.getConstant(0, DL, MVT::i32);
if (!C1)
- C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
+ C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
return {N0, SDValue(C1, 0)};
}
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
-void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
SelectionDAG &DAG, SDValue *Offsets,
unsigned Align) const {
SDLoc DL(CombinedOffset);
@@ -7018,8 +7175,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
- Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
- return;
+ Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+ return SOffset + ImmOffset;
}
}
if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7031,13 +7188,14 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Subtarget, Align)) {
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
- Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
- return;
+ Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+ return 0;
}
}
Offsets[0] = CombinedOffset;
Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
- Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
+ return 0;
}
// Handle 8 bit and 16 bit buffer loads
@@ -7053,9 +7211,10 @@ SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
Ops, IntVT,
M->getMemOperand());
- SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
- LoadVT.getScalarType(), BufferLoad);
- return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+ SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
+ LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
+
+ return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
// Handle 8 bit and 16 bit buffer stores
@@ -7063,6 +7222,9 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
EVT VDataType, SDLoc DL,
SDValue Ops[],
MemSDNode *M) const {
+ if (VDataType == MVT::f16)
+ Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
+
SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
Ops[1] = BufferStoreExt;
unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
@@ -7215,8 +7377,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
- if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- *Load->getMemOperand())) {
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ MemVT, *Load->getMemOperand())) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, DL);
@@ -7505,6 +7667,19 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}
+// Returns immediate value for setting the F32 denorm mode when using the
+// S_DENORM_MODE instruction.
+static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
+ const SDLoc &SL, const GCNSubtarget *ST) {
+ assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
+ int DPDenormModeDefault = ST->hasFP64Denormals()
+ ? FP_DENORM_FLUSH_NONE
+ : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+ int Mode = SPDenormMode | (DPDenormModeDefault << 2);
+ return DAG.getTargetConstant(Mode, SL, MVT::i32);
+}
+
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
return FastLowered;
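For reference, a minimal standalone sketch of how the S_DENORM_MODE immediate produced by getSPDenormModeValue above is packed: bits [1:0] carry the FP32 denorm mode and bits [3:2] the FP64/FP16 default. The enum values and helper names below are assumptions chosen to mirror the usual FP_DENORM_* encoding; they are not part of this patch.

#include <cassert>

// Matches Mode = SPDenormMode | (DPDenormModeDefault << 2) from the helper above.
enum DenormModeValue {
  FlushInFlushOut = 0, // flush input and output denormals (assumed value)
  FlushNone = 3        // keep denormals (assumed value)
};

static int packDenormModeImm(int SPMode, bool HasFP64Denormals) {
  int DPModeDefault = HasFP64Denormals ? FlushNone : FlushInFlushOut;
  return SPMode | (DPModeDefault << 2);
}

int main() {
  // Enabling FP32 denormals while FP64 denormals stay on gives the all-ones
  // 4-bit immediate; flushing them again gives 0xC.
  assert(packDenormModeImm(FlushNone, /*HasFP64Denormals=*/true) == 0xF);
  assert(packDenormModeImm(FlushInFlushOut, /*HasFP64Denormals=*/true) == 0xC);
  return 0;
}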
@@ -7531,16 +7706,26 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
(4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
(1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
-
const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
if (!Subtarget->hasFP32Denormals()) {
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
- SL, MVT::i32);
- SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
- DAG.getEntryNode(),
- EnableDenormValue, BitField);
+
+ SDValue EnableDenorm;
+ if (Subtarget->hasDenormModeInst()) {
+ const SDValue EnableDenormValue =
+ getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
+
+ EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
+ DAG.getEntryNode(), EnableDenormValue);
+ } else {
+ const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+ SL, MVT::i32);
+ EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+ DAG.getEntryNode(), EnableDenormValue,
+ BitField);
+ }
+
SDValue Ops[3] = {
NegDivScale0,
EnableDenorm.getValue(0),
@@ -7562,19 +7747,29 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
NumeratorScaled, Mul);
- SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+ SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
NumeratorScaled, Fma3);
if (!Subtarget->hasFP32Denormals()) {
- const SDValue DisableDenormValue =
- DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
- SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
- Fma4.getValue(1),
- DisableDenormValue,
- BitField,
- Fma4.getValue(2));
+
+ SDValue DisableDenorm;
+ if (Subtarget->hasDenormModeInst()) {
+ const SDValue DisableDenormValue =
+ getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
+
+ DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
+ Fma4.getValue(1), DisableDenormValue,
+ Fma4.getValue(2));
+ } else {
+ const SDValue DisableDenormValue =
+ DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+
+ DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+ Fma4.getValue(1), DisableDenormValue,
+ BitField, Fma4.getValue(2));
+ }
SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
DisableDenorm, DAG.getRoot());
@@ -7684,8 +7879,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.isVector() &&
Store->getValue().getValueType().getScalarType() == MVT::i32);
- if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- *Store->getMemOperand())) {
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ VT, *Store->getMemOperand())) {
return expandUnalignedStore(Store, DAG);
}
@@ -10065,7 +10260,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
// Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
// to try understanding copies to physical registers.
if (SrcVal.getValueType() == MVT::i1 &&
- TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
+ Register::isPhysicalRegister(DestReg->getReg())) {
SDLoc SL(Node);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
SDValue VReg = DAG.getRegister(
@@ -10218,7 +10413,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MachineOperand &Op = MI.getOperand(I);
if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
- !TargetRegisterInfo::isVirtualRegister(Op.getReg()) ||
+ !Register::isVirtualRegister(Op.getReg()) ||
!TRI->isAGPR(MRI, Op.getReg()))
continue;
auto *Src = MRI.getUniqueVRegDef(Op.getReg());
@@ -10256,7 +10451,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
Node->use_begin()->isMachineOpcode() &&
Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
!Node->use_begin()->hasAnyUseOfValue(0))) {
- unsigned Def = MI.getOperand(0).getReg();
+ Register Def = MI.getOperand(0).getReg();
// Change this into a noret atomic.
MI.setDesc(TII->get(NoRetAtomicOp));
@@ -10300,7 +10495,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
// Combine the constants and the pointer.
const SDValue Ops1[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
Ptr,
DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
SubRegHi,
@@ -10330,7 +10525,7 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
const SDValue Ops[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
PtrLo,
DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
PtrHi,
@@ -10364,7 +10559,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, nullptr);
case 32:
case 16:
- RC = &AMDGPU::SReg_32_XM0RegClass;
+ RC = &AMDGPU::SReg_32RegClass;
break;
case 64:
RC = &AMDGPU::SGPR_64RegClass;
@@ -10373,7 +10568,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
RC = &AMDGPU::SReg_96RegClass;
break;
case 128:
- RC = &AMDGPU::SReg_128RegClass;
+ RC = &AMDGPU::SGPR_128RegClass;
break;
case 160:
RC = &AMDGPU::SReg_160RegClass;
@@ -10415,6 +10610,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
break;
case 'a':
+ if (!Subtarget->hasMAIInsts())
+ break;
switch (VT.getSizeInBits()) {
default:
return std::make_pair(0U, nullptr);
@@ -10548,9 +10745,9 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
-unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
- const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
- const unsigned CacheLineAlign = 6; // log2(64)
+Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
+ const Align CacheLineAlign = Align(64);
// Pre-GFX10 target did not benefit from loop alignment
if (!ML || DisableLoopAlignment ||
@@ -10578,7 +10775,7 @@ unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
// If inner loop block is aligned assume in average half of the alignment
// size to be added as nops.
if (MBB != Header)
- LoopSize += (1 << MBB->getAlignment()) / 2;
+ LoopSize += MBB->getAlignment().value() / 2;
for (const MachineInstr &MI : *MBB) {
LoopSize += TII->getInstSizeInBytes(MI);
@@ -10644,7 +10841,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
const MachineRegisterInfo &MRI = MF->getRegInfo();
const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
unsigned Reg = R->getReg();
- if (TRI.isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return !TRI.isSGPRReg(MRI, Reg);
if (MRI.isLiveIn(Reg)) {
@@ -10683,12 +10880,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
case ISD::INTRINSIC_W_CHAIN:
return AMDGPU::isIntrinsicSourceOfDivergence(
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
- // In some cases intrinsics that are a source of divergence have been
- // lowered to AMDGPUISD so we also need to check those too.
- case AMDGPUISD::INTERP_MOV:
- case AMDGPUISD::INTERP_P1:
- case AMDGPUISD::INTERP_P2:
- return true;
}
return false;
}
@@ -10748,3 +10939,110 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}
+
+const TargetRegisterClass *
+SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+ const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
+ return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
+ : &AMDGPU::SReg_32RegClass;
+ if (!TRI->isSGPRClass(RC) && !isDivergent)
+ return TRI->getEquivalentSGPRClass(RC);
+ else if (TRI->isSGPRClass(RC) && isDivergent)
+ return TRI->getEquivalentVGPRClass(RC);
+
+ return RC;
+}
+
+static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+ if (!Visited.insert(V).second)
+ return false;
+ bool Result = false;
+ for (auto U : V->users()) {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
+ if (V == U->getOperand(1)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ Result = false;
+ break;
+ case Intrinsic::amdgcn_if_break:
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else:
+ Result = true;
+ break;
+ }
+ }
+ if (V == U->getOperand(0)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ Result = false;
+ break;
+ case Intrinsic::amdgcn_end_cf:
+ case Intrinsic::amdgcn_loop:
+ Result = true;
+ break;
+ }
+ }
+ } else {
+ Result = hasCFUser(U, Visited);
+ }
+ if (Result)
+ break;
+ }
+ return Result;
+}
+
+bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
+ const Value *V) const {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if_break:
+ return true;
+ }
+ }
+ if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+ if (const IntrinsicInst *Intrinsic =
+ dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else: {
+ ArrayRef<unsigned> Indices = ExtValue->getIndices();
+ if (Indices.size() == 1 && Indices[0] == 1) {
+ return true;
+ }
+ }
+ }
+ }
+ }
+ if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+ if (isa<InlineAsm>(CI->getCalledValue())) {
+ const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+ ImmutableCallSite CS(CI);
+ TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
+ MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+ for (auto &TC : TargetConstraints) {
+ if (TC.Type == InlineAsm::isOutput) {
+ ComputeConstraintToUse(TC, SDValue());
+ unsigned AssignedReg;
+ const TargetRegisterClass *RC;
+ std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
+ SIRI, TC.ConstraintCode, TC.ConstraintVT);
+ if (RC) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
+ return true;
+ else if (SIRI->isSGPRClass(RC))
+ return true;
+ }
+ }
+ }
+ }
+ }
+ SmallPtrSet<const Value *, 16> Visited;
+ return hasCFUser(V, Visited);
+}
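A simplified, self-contained model of the new setBufferOffsets contract used throughout the hunks above: the helper reports the combined constant offset when it can prove one, the caller zeroes it whenever idxen is set, and the result is recorded on the MachineMemOperand. splitConstOffset and its 4095 immediate limit are illustrative stand-ins, not the real AMDGPU::splitMUBUFOffset.

#include <cstdint>
#include <iostream>
#include <optional>

struct SplitOffset { uint32_t SOffset; uint32_t ImmOffset; };

// Illustrative stand-in; real immediate limits depend on the subtarget.
static std::optional<SplitOffset> splitConstOffset(uint64_t Imm) {
  if (Imm <= 4095)
    return SplitOffset{0, static_cast<uint32_t>(Imm)};
  return std::nullopt; // cannot prove a single constant offset here
}

// Mirrors the new return value: the known constant offset, or 0 if unknown.
static unsigned setBufferOffsetsModel(std::optional<uint64_t> CombinedConst) {
  if (CombinedConst)
    if (auto S = splitConstOffset(*CombinedConst))
      return S->SOffset + S->ImmOffset;
  return 0;
}

int main() {
  bool IdxEn = true;
  unsigned Offset = setBufferOffsetsModel(64);
  if (IdxEn) // the offset is relative to an unknown vindex, so drop it
    Offset = 0;
  std::cout << "MMO offset = " << Offset << "\n"; // what setOffset would record
}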
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 21a215e16ce7..f0102feb65c4 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -94,6 +94,9 @@ private:
SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
+ SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
+ ArrayRef<SDValue> Ops) const;
+
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
@@ -183,6 +186,7 @@ private:
unsigned isCFIntrinsic(const SDNode *Intr) const;
+public:
/// \returns True if fixup needs to be emitted for given global value \p GV,
/// false otherwise.
bool shouldEmitFixup(const GlobalValue *GV) const;
@@ -195,11 +199,14 @@ private:
/// global value \p GV, false otherwise.
bool shouldEmitPCReloc(const GlobalValue *GV) const;
+private:
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
- void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
- SDValue *Offsets, unsigned Align = 4) const;
+ /// \returns 0 if there is a non-constant offset or if the offset is 0.
+ /// Otherwise returns the constant offset.
+ unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
+ SDValue *Offsets, unsigned Align = 4) const;
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
@@ -235,6 +242,11 @@ public:
bool canMergeStoresTo(unsigned AS, EVT MemVT,
const SelectionDAG &DAG) const override;
+ bool allowsMisalignedMemoryAccessesImpl(
+ unsigned Size, unsigned AS, unsigned Align,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *IsFast = nullptr) const;
+
bool allowsMisalignedMemoryAccesses(
EVT VT, unsigned AS, unsigned Align,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
@@ -309,12 +321,13 @@ public:
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
MachineBasicBlock *splitKillBlock(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ void bundleInstWithWaitcnt(MachineInstr &MI) const;
MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI,
MachineBasicBlock *BB) const;
@@ -330,6 +343,7 @@ public:
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -374,7 +388,37 @@ public:
unsigned Depth = 0) const override;
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
- unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+ virtual const TargetRegisterClass *
+ getRegClassFor(MVT VT, bool isDivergent) const override;
+ virtual bool requiresUniformRegister(MachineFunction &MF,
+ const Value *V) const override;
+ Align getPrefLoopAlignment(MachineLoop *ML) const override;
+
+ void allocateHSAUserSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+
+ void allocateSystemSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ SIMachineFunctionInfo &Info,
+ CallingConv::ID CallConv,
+ bool IsShader) const;
+
+ void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+ void allocateSpecialInputSGPRs(
+ CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+
+ void allocateSpecialInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
};
} // End namespace llvm
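A toy sketch of the policy behind the getRegClassFor(MVT, bool isDivergent) and requiresUniformRegister hooks declared above, as implemented earlier in SIISelLowering.cpp: divergent values get vector register classes, uniform values scalar ones, and a uniform i1 follows the wavefront size. The names below are illustrative only, not the LLVM implementation.

#include <iostream>
#include <string>

static std::string pickClass(bool IsDivergent, bool IsBool, unsigned WaveSize) {
  if (IsBool && !IsDivergent)            // lane mask: one bit per lane
    return WaveSize == 64 ? "SReg_64" : "SReg_32";
  return IsDivergent ? "a VGPR class" : "an SGPR class";
}

int main() {
  std::cout << "divergent f32 -> " << pickClass(true, false, 64) << "\n";
  std::cout << "uniform i1    -> " << pickClass(false, true, 32) << "\n";
}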
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c89d5b71ec5c..dcb04e426584 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1483,12 +1483,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (BI.Incoming) {
if (!Brackets)
- Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
+ Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
else
*Brackets = *BI.Incoming;
} else {
if (!Brackets)
- Brackets = llvm::make_unique<WaitcntBrackets>(ST);
+ Brackets = std::make_unique<WaitcntBrackets>(ST);
else
Brackets->clear();
}
@@ -1508,7 +1508,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (!MoveBracketsToSucc) {
MoveBracketsToSucc = &SuccBI;
} else {
- SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
+ SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
}
} else if (SuccBI.Incoming->merge(*Brackets)) {
SuccBI.Dirty = true;
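The SIInsertWaitcnts change above is purely mechanical: with LLVM's move to C++14, llvm::make_unique is replaced by the standard std::make_unique. A minimal usage sketch, where the struct is only a stand-in for WaitcntBrackets:

#include <memory>

struct BracketsLike { int Score = 0; };

int main() {
  auto Fresh = std::make_unique<BracketsLike>();       // default-construct
  auto Copy  = std::make_unique<BracketsLike>(*Fresh); // copy-construct, as above
  return Copy->Score;
}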
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 561a16c3e351..4dcbe92861f2 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -124,6 +124,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is one of MFMA instructions.
field bit IsMAI = 0;
+ // This bit indicates that this is one of DOT instructions.
+ field bit IsDOT = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -189,6 +192,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{54} = IsMAI;
+ let TSFlags{55} = IsDOT;
+
let SchedRW = [Write32Bit];
field bits<1> DisableSIDecoder = 0;
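A standalone sketch of how the new IsDOT flag can be packed and queried. The bit positions mirror the TableGen change above (TSFlags{54} = IsMAI, TSFlags{55} = IsDOT); the helper names are illustrative, not the real SIInstrInfo accessors.

#include <cassert>
#include <cstdint>

constexpr uint64_t IsMAIBit = 1ull << 54; // TSFlags{54}
constexpr uint64_t IsDOTBit = 1ull << 55; // TSFlags{55}

constexpr bool isDOT(uint64_t TSFlags) { return (TSFlags & IsDOTBit) != 0; }

int main() {
  uint64_t Flags = IsDOTBit;  // an instruction tagged as a DOT instruction
  assert(isDOT(Flags));
  assert(!isDOT(IsMAIBit));   // an MFMA instruction is not a DOT instruction
  return 0;
}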
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index ba8ed6993a56..d97e6a62971b 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -318,8 +318,25 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
- if (SOffset && SOffset->isReg())
- return false;
+ if (SOffset && SOffset->isReg()) {
+ // We can only handle this if it's a stack access, as any other resource
+ // would require reporting multiple base registers.
+ const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (AddrReg && !AddrReg->isFI())
+ return false;
+
+ const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
+ const SIMachineFunctionInfo *MFI
+ = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
+ if (RSrc->getReg() != MFI->getScratchRSrcReg())
+ return false;
+
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ BaseOp = SOffset;
+ Offset = OffsetImm->getImm();
+ return true;
+ }
const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (!AddrReg)
@@ -458,9 +475,9 @@ bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
const MachineRegisterInfo &MRI =
FirstLdSt.getParent()->getParent()->getRegInfo();
- const unsigned Reg = FirstDst->getReg();
+ const Register Reg = FirstDst->getReg();
- const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg)
+ const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
? MRI.getRegClass(Reg)
: RI.getPhysRegClass(Reg);
@@ -807,7 +824,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
"Not a VGPR32 reg");
if (Cond.size() == 1) {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(Cond[0]);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
@@ -820,7 +837,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
assert(Cond[0].isImm() && "Cond[0] is not an immediate");
switch (Cond[0].getImm()) {
case SIInstrInfo::SCC_TRUE: {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64), SReg)
.addImm(-1)
@@ -834,7 +851,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
break;
}
case SIInstrInfo::SCC_FALSE: {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64), SReg)
.addImm(0)
@@ -850,7 +867,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::VCCNZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
@@ -864,7 +881,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::VCCZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
@@ -876,8 +893,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
break;
}
case SIInstrInfo::EXECNZ: {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
- unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
@@ -894,8 +911,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
break;
}
case SIInstrInfo::EXECZ: {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
- unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
@@ -925,7 +942,7 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
const DebugLoc &DL,
unsigned SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
@@ -938,7 +955,7 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
const DebugLoc &DL,
unsigned SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
@@ -1052,12 +1069,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// The SGPR spill/restore instructions only work on number sgprs, so we need
// to make sure we are using the correct register class.
- if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
+ if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
}
- MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
+ BuildMI(MBB, MI, DL, OpDesc)
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
@@ -1068,11 +1085,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// correctly handled.
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
- if (ST.hasScalarStores()) {
- // m0 is used for offset to scalar stores if used to spill.
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
- }
-
return;
}
@@ -1083,7 +1095,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
if (RI.hasAGPRs(RC)) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MIB.addReg(Tmp, RegState::Define);
}
MIB.addReg(SrcReg, getKillRegState(isKill)) // data
@@ -1182,24 +1194,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
- if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
+ if (Register::isVirtualRegister(DestReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
- MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
+ BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
-
- if (ST.hasScalarStores()) {
- // m0 is used for offset to scalar stores if used to spill.
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
- }
-
return;
}
@@ -1208,7 +1214,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
if (RI.hasAGPRs(RC)) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MIB.addReg(Tmp, RegState::Define);
}
MIB.addFrameIndex(FrameIndex) // vaddr
@@ -1242,13 +1248,13 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
WorkGroupSize > WavefrontSize) {
- unsigned TIDIGXReg
- = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
- unsigned TIDIGYReg
- = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
- unsigned TIDIGZReg
- = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
- unsigned InputPtrReg =
+ Register TIDIGXReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+ Register TIDIGYReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+ Register TIDIGZReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ Register InputPtrReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
if (!Entry.isLiveIn(Reg))
@@ -1410,9 +1416,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
break;
case AMDGPU::V_MOV_B64_PSEUDO: {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
- unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+ Register Dst = MI.getOperand(0).getReg();
+ Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+ Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
@@ -1437,6 +1443,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
+ expandMovDPP64(MI);
+ break;
+ }
case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -1469,7 +1479,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_MOVRELD_B32_V8:
case AMDGPU::V_MOVRELD_B32_V16: {
const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
- unsigned VecReg = MI.getOperand(0).getReg();
+ Register VecReg = MI.getOperand(0).getReg();
bool IsUndef = MI.getOperand(1).isUndef();
unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
assert(VecReg == MI.getOperand(1).getReg());
@@ -1492,9 +1502,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case AMDGPU::SI_PC_ADD_REL_OFFSET: {
MachineFunction &MF = *MBB.getParent();
- unsigned Reg = MI.getOperand(0).getReg();
- unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
- unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
+ Register Reg = MI.getOperand(0).getReg();
+ Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
+ Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
// Create a bundle so these instructions won't be re-ordered by the
// post-RA scheduler.
@@ -1531,7 +1541,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
break;
}
case TargetOpcode::BUNDLE: {
- if (!MI.mayLoad())
+ if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
return false;
// If it is a load it must be a memory clause
@@ -1550,6 +1560,64 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
+std::pair<MachineInstr*, MachineInstr*>
+SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
+ assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned Part = 0;
+ MachineInstr *Split[2];
+
+
+ for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
+ auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
+ if (Dst.isPhysical()) {
+ MovDPP.addDef(RI.getSubReg(Dst, Sub));
+ } else {
+ assert(MRI.isSSA());
+ auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MovDPP.addDef(Tmp);
+ }
+
+ for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
+ const MachineOperand &SrcOp = MI.getOperand(I);
+ assert(!SrcOp.isFPImm());
+ if (SrcOp.isImm()) {
+ APInt Imm(64, SrcOp.getImm());
+ Imm.ashrInPlace(Part * 32);
+ MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
+ } else {
+ assert(SrcOp.isReg());
+ Register Src = SrcOp.getReg();
+ if (Src.isPhysical())
+ MovDPP.addReg(RI.getSubReg(Src, Sub));
+ else
+ MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
+ }
+ }
+
+ for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
+ MovDPP.addImm(MI.getOperand(I).getImm());
+
+ Split[Part] = MovDPP;
+ ++Part;
+ }
+
+ if (Dst.isVirtual())
+ BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(Split[0]->getOperand(0).getReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(Split[1]->getOperand(0).getReg())
+ .addImm(AMDGPU::sub1);
+
+ MI.eraseFromParent();
+ return std::make_pair(Split[0], Split[1]);
+}
+
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0,
unsigned Src0OpName,
@@ -1574,7 +1642,7 @@ bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
MachineOperand &RegOp,
MachineOperand &NonRegOp) {
- unsigned Reg = RegOp.getReg();
+ Register Reg = RegOp.getReg();
unsigned SubReg = RegOp.getSubReg();
bool IsKill = RegOp.isKill();
bool IsDead = RegOp.isDead();
@@ -1646,7 +1714,8 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
-bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
+bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx0,
unsigned &SrcOpIdx1) const {
return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}
@@ -1710,7 +1779,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
// FIXME: Virtual register workaround for RegScavenger not working with empty
// blocks.
- unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
auto I = MBB.end();
@@ -2163,7 +2232,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
SmallVector<unsigned, 8> Regs;
for (int Idx = 0; Idx != NElts; ++Idx) {
- unsigned DstElt = MRI.createVirtualRegister(EltRC);
+ Register DstElt = MRI.createVirtualRegister(EltRC);
Regs.push_back(DstElt);
unsigned SubIdx = SubIndices[Idx];
@@ -2327,7 +2396,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
UseMI.RemoveOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
- unsigned Src1Reg = Src1->getReg();
+ Register Src1Reg = Src1->getReg();
unsigned Src1SubReg = Src1->getSubReg();
Src0->setReg(Src1Reg);
Src0->setSubReg(Src1SubReg);
@@ -2367,12 +2436,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MRI->hasOneUse(Src0->getReg())) {
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
Src0Inlined = true;
- } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
- (RI.isVirtualRegister(Src0->getReg()) &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
+ } else if ((Register::isPhysicalRegister(Src0->getReg()) &&
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
+ (Register::isVirtualRegister(Src0->getReg()) &&
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
return false;
// VGPR is okay as Src0 - fallthrough
}
@@ -2385,10 +2454,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MRI->hasOneUse(Src1->getReg()) &&
commuteInstruction(UseMI)) {
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
- RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
- (RI.isVirtualRegister(Src1->getReg()) &&
- RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ } else if ((Register::isPhysicalRegister(Src1->getReg()) &&
+ RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
+ (Register::isVirtualRegister(Src1->getReg()) &&
+ RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
return false;
// VGPR is okay as Src1 - fallthrough
}
@@ -2472,8 +2541,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
}
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA) const {
+ const MachineInstr &MIb) const {
assert((MIa.mayLoad() || MIa.mayStore()) &&
"MIa must load from or modify a memory location");
assert((MIb.mayLoad() || MIb.mayStore()) &&
@@ -2664,6 +2732,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
+ MI.getOpcode() == AMDGPU::S_DENORM_MODE ||
changesVGPRIndexingMode(MI);
}
@@ -2865,8 +2934,16 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
if (OpInfo.RegClass < 0)
return false;
- if (MO.isImm() && isInlineConstant(MO, OpInfo))
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+
+ if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
+ if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
+ OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::src2))
+ return false;
return RI.opCanUseInlineConstant(OpInfo.OperandType);
+ }
if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
return false;
@@ -2874,8 +2951,6 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
return true;
- const MachineFunction *MF = MI.getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
return ST.hasVOP3Literal();
}
@@ -3036,7 +3111,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
if (!MO.isUse())
return false;
- if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (Register::isVirtualRegister(MO.getReg()))
return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
// Null is free
@@ -3093,7 +3168,8 @@ static bool shouldReadExec(const MachineInstr &MI) {
return true;
}
- if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
+ if (MI.isPreISelOpcode() ||
+ SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
SIInstrInfo::isSALU(MI) ||
SIInstrInfo::isSMRD(MI))
return false;
@@ -3104,7 +3180,7 @@ static bool shouldReadExec(const MachineInstr &MI) {
static bool isSubRegOf(const SIRegisterInfo &TRI,
const MachineOperand &SuperVec,
const MachineOperand &SubReg) {
- if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
+ if (Register::isPhysicalRegister(SubReg.getReg()))
return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
@@ -3144,8 +3220,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (!Op.isReg())
continue;
- unsigned Reg = Op.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+ Register Reg = Op.getReg();
+ if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) {
ErrInfo = "inlineasm operand has incorrect register class.";
return false;
}
@@ -3209,9 +3285,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
continue;
if (RegClass != -1) {
- unsigned Reg = MI.getOperand(i).getReg();
- if (Reg == AMDGPU::NoRegister ||
- TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MI.getOperand(i).getReg();
+ if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg))
continue;
const TargetRegisterClass *RC = RI.getRegClass(RegClass);
@@ -3304,7 +3379,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo =
"Dst register should be tied to implicit use of preserved register";
return false;
- } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
+ } else if (Register::isPhysicalRegister(TiedMO.getReg()) &&
Dst.getReg() != TiedMO.getReg()) {
ErrInfo = "Dst register should use same physical register as preserved";
return false;
@@ -3409,6 +3484,32 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ // Special case for writelane - this can break the multiple constant bus rule,
+ // but still can't use more than one SGPR register
+ if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
+ unsigned SGPRCount = 0;
+ Register SGPRUsed = AMDGPU::NoRegister;
+
+ for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+ if (OpIdx == -1)
+ break;
+
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+
+ if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
+ if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
+ if (MO.getReg() != SGPRUsed)
+ ++SGPRCount;
+ SGPRUsed = MO.getReg();
+ }
+ }
+ if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
+ ErrInfo = "WRITELANE instruction violates constant bus restriction";
+ return false;
+ }
+ }
+ }
+
// Verify misc. restrictions on specific instructions.
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
@@ -3609,7 +3710,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
ErrInfo = "Invalid dpp_ctrl value: "
- "broadcats are not supported on GFX10+";
+ "broadcasts are not supported on GFX10+";
return false;
}
if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
@@ -3631,6 +3732,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::PHI: return AMDGPU::PHI;
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;
+ case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
case AMDGPU::WWM: return AMDGPU::WWM;
case AMDGPU::S_MOV_B32: {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -3708,9 +3810,9 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
const MCInstrDesc &Desc = get(MI.getOpcode());
if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
Desc.OpInfo[OpNo].RegClass == -1) {
- unsigned Reg = MI.getOperand(OpNo).getReg();
+ Register Reg = MI.getOperand(OpNo).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return MRI.getRegClass(Reg);
return RI.getPhysRegClass(Reg);
}
@@ -3741,7 +3843,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
else
VRC = &AMDGPU::VGPR_32RegClass;
- unsigned Reg = MRI.createVirtualRegister(VRC);
+ Register Reg = MRI.createVirtualRegister(VRC);
DebugLoc DL = MBB->findDebugLoc(I);
BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
MO.ChangeToRegister(Reg, false);
@@ -3756,7 +3858,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
const {
MachineBasicBlock *MBB = MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- unsigned SubReg = MRI.createVirtualRegister(SubRC);
+ Register SubReg = MRI.createVirtualRegister(SubRC);
if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
@@ -3768,7 +3870,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
// value so we don't need to worry about merging its subreg index with the
// SubIdx passed to this function. The register coalescer should be able to
// eliminate this extra copy.
- unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+ Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
.addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
@@ -3814,11 +3916,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
if (!MO.isReg())
return false;
- unsigned Reg = MO.getReg();
- const TargetRegisterClass *RC =
- TargetRegisterInfo::isVirtualRegister(Reg) ?
- MRI.getRegClass(Reg) :
- RI.getPhysRegClass(Reg);
+ Register Reg = MO.getReg();
+ const TargetRegisterClass *RC = Register::isVirtualRegister(Reg)
+ ? MRI.getRegClass(Reg)
+ : RI.getPhysRegClass(Reg);
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
@@ -3935,13 +4036,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
if (Opc == AMDGPU::V_WRITELANE_B32) {
const DebugLoc &DL = MI.getDebugLoc();
if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src0);
Src0.ChangeToRegister(Reg, false);
}
if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src1);
@@ -3967,7 +4068,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
// select is uniform.
if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
RI.isVGPR(MRI, Src1.getReg())) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src1);
@@ -4003,7 +4104,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MI.setDesc(get(CommutedOpc));
- unsigned Src0Reg = Src0.getReg();
+ Register Src0Reg = Src0.getReg();
unsigned Src0SubReg = Src0.getSubReg();
bool Src0Kill = Src0.isKill();
@@ -4039,13 +4140,13 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
const DebugLoc &DL = MI.getDebugLoc();
if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src1);
Src1.ChangeToRegister(Reg, false);
}
if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src2);
Src2.ChangeToRegister(Reg, false);
@@ -4113,12 +4214,12 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
MachineRegisterInfo &MRI) const {
const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
- unsigned DstReg = MRI.createVirtualRegister(SRC);
+ Register DstReg = MRI.createVirtualRegister(SRC);
unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
if (RI.hasAGPRs(VRC)) {
VRC = RI.getEquivalentVGPRClass(VRC);
- unsigned NewSrcReg = MRI.createVirtualRegister(VRC);
+ Register NewSrcReg = MRI.createVirtualRegister(VRC);
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
get(TargetOpcode::COPY), NewSrcReg)
.addReg(SrcReg);
@@ -4134,7 +4235,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
SmallVector<unsigned, 8> SRegs;
for (unsigned i = 0; i < SubRegs; ++i) {
- unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
.addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
@@ -4176,7 +4277,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
MachineOperand &Op,
MachineRegisterInfo &MRI,
const DebugLoc &DL) const {
- unsigned OpReg = Op.getReg();
+ Register OpReg = Op.getReg();
unsigned OpSubReg = Op.getSubReg();
const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
@@ -4186,7 +4287,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
if (DstRC == OpRC)
return;
- unsigned DstReg = MRI.createVirtualRegister(DstRC);
+ Register DstReg = MRI.createVirtualRegister(DstRC);
MachineInstr *Copy =
BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
@@ -4198,8 +4299,19 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
return;
// Try to eliminate the copy if it is copying an immediate value.
- if (Def->isMoveImmediate())
+ if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
FoldImmediate(*Copy, *Def, OpReg, &MRI);
+
+ bool ImpDef = Def->isImplicitDef();
+ while (!ImpDef && Def && Def->isCopy()) {
+ if (Def->getOperand(1).getReg().isPhysical())
+ break;
+ Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
+ ImpDef = Def && Def->isImplicitDef();
+ }
+ if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
+ !ImpDef)
+ Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
}
// Emit the actual waterfall loop, executing the wrapped instruction for each
@@ -4223,18 +4335,18 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
MachineBasicBlock::iterator I = LoopBB.begin();
- unsigned VRsrc = Rsrc.getReg();
+ Register VRsrc = Rsrc.getReg();
unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
- unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
- unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
- unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
- unsigned AndCond = MRI.createVirtualRegister(BoolXExecRC);
- unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+ Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+ Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
+ Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
// Beginning of the loop, read the next Rsrc variant.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
@@ -4302,7 +4414,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
// Save the EXEC mask
BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
@@ -4370,10 +4482,10 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
// Create an empty resource descriptor
- unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
// Zero64 = 0
@@ -4430,7 +4542,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
if (!MI.getOperand(i).isReg() ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+ !Register::isVirtualRegister(MI.getOperand(i).getReg()))
continue;
const TargetRegisterClass *OpRC =
MRI.getRegClass(MI.getOperand(i).getReg());
@@ -4447,8 +4559,16 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
if (!VRC) {
assert(SRC);
- VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC)
- : RI.getEquivalentVGPRClass(SRC);
+ if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
+ VRC = &AMDGPU::VReg_1RegClass;
+ } else
+ VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
+ ? RI.getEquivalentAGPRClass(SRC)
+ : RI.getEquivalentVGPRClass(SRC);
+ } else {
+ VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
+ ? RI.getEquivalentAGPRClass(VRC)
+ : RI.getEquivalentVGPRClass(VRC);
}
RC = VRC;
} else {
@@ -4458,7 +4578,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Update all the operands so they have the same type.
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
continue;
// MI is a PHI instruction.
@@ -4483,7 +4603,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// subregister index types e.g. sub0_sub1 + sub2 + sub3
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
continue;
const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
@@ -4502,8 +4622,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Legalize INSERT_SUBREG
// src0 must have the same register class as dst
if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src0 = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
if (DstRC != Src0RC) {
@@ -4577,13 +4697,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
// This is already an ADDR64 instruction so we need to add the pointer
// extracted from the resource descriptor to the current value of VAddr.
- unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
- unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+ Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+ Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
unsigned RsrcPtr, NewSRsrc;
std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
@@ -4623,7 +4743,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
unsigned RsrcPtr, NewSRsrc;
std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
- unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
@@ -4661,6 +4781,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
MIB.addImm(TFE->getImm());
}
+ MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
+
MIB.cloneMemRefs(MI);
Addr64 = MIB;
} else {
@@ -4933,8 +5055,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
unsigned NewDstReg = AMDGPU::NoRegister;
if (HasDst) {
- unsigned DstReg = Inst.getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ Register DstReg = Inst.getOperand(0).getReg();
+ if (Register::isPhysicalRegister(DstReg))
continue;
// Update the destination register class.
@@ -4943,7 +5065,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;
if (Inst.isCopy() &&
- TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
+ Register::isVirtualRegister(Inst.getOperand(1).getReg()) &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
// Instead of creating a copy where src and dst are the same register
// class, we just replace all uses of dst with src. These kinds of
@@ -4988,8 +5110,8 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- unsigned OldDstReg = Inst.getOperand(0).getReg();
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register OldDstReg = Inst.getOperand(0).getReg();
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned Opc = Inst.getOpcode();
assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
@@ -5022,8 +5144,8 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src = Inst.getOperand(1);
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned SubOp = ST.hasAddNoCarry() ?
AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
@@ -5052,7 +5174,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
MachineOperand &Src1 = Inst.getOperand(2);
if (ST.hasDLInsts()) {
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
@@ -5072,8 +5194,8 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
bool Src1IsSGPR = Src1.isReg() &&
RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
MachineInstr *Xor;
- unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
// Build a pair of scalar instructions and add them to the work list.
// The next iteration over the work list will lower these to the vector
@@ -5117,8 +5239,8 @@ void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
.add(Src0)
@@ -5146,8 +5268,8 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
.add(Src1);
@@ -5189,16 +5311,16 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
- unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
+ Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
- unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
+ Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
- unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+ Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
@@ -5226,12 +5348,12 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned CarryReg = MRI.createVirtualRegister(CarryRC);
- unsigned DeadCarryReg = MRI.createVirtualRegister(CarryRC);
+ Register CarryReg = MRI.createVirtualRegister(CarryRC);
+ Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src0 = Inst.getOperand(1);
@@ -5327,17 +5449,17 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
- unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
+ Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
.add(SrcReg0Sub0)
.add(SrcReg1Sub0);
- unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
+ Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
.add(SrcReg0Sub1)
.add(SrcReg1Sub1);
- unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+ Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
@@ -5368,7 +5490,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
- unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
MachineOperand* Op0;
MachineOperand* Op1;
@@ -5384,7 +5506,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
.add(*Op0);
- unsigned NewDest = MRI.createVirtualRegister(DestRC);
+ Register NewDest = MRI.createVirtualRegister(DestRC);
MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
.addReg(Interm)
@@ -5411,8 +5533,8 @@ void SIInstrInfo::splitScalar64BitBCNT(
MRI.getRegClass(Src.getReg()) :
&AMDGPU::SGPR_32RegClass;
- unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
@@ -5451,9 +5573,9 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
Offset == 0 && "Not implemented");
if (BitWidth < 32) {
- unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
.addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
@@ -5476,8 +5598,8 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
}
MachineOperand &Src = Inst.getOperand(1);
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
.addImm(31)
@@ -5506,6 +5628,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
switch (UseMI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::WQM:
+ case AMDGPU::SOFT_WQM:
case AMDGPU::WWM:
case AMDGPU::REG_SEQUENCE:
case AMDGPU::PHI:
@@ -5531,7 +5654,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const {
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineBasicBlock *MBB = Inst.getParent();
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
@@ -5539,8 +5662,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
switch (Inst.getOpcode()) {
case AMDGPU::S_PACK_LL_B32_B16: {
- unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// FIXME: Can do a lot better if we know the high bits of src0 or src1 are
// 0.
@@ -5558,7 +5681,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
break;
}
case AMDGPU::S_PACK_LH_B32_B16: {
- unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
.addImm(0xffff);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
@@ -5568,8 +5691,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
break;
}
case AMDGPU::S_PACK_HH_B32_B16: {
- unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
.addImm(16)
.add(Src0);
@@ -5623,17 +5746,27 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:
+ case AMDGPU::SOFT_WQM:
case AMDGPU::WWM: {
const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
if (RI.hasAGPRs(SrcRC)) {
if (RI.hasAGPRs(NewDstRC))
return nullptr;
- NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
+ switch (Inst.getOpcode()) {
+ case AMDGPU::PHI:
+ case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::INSERT_SUBREG:
+ NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
+ break;
+ default:
+ NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+ }
+
if (!NewDstRC)
return nullptr;
} else {
- if (RI.hasVGPRs(NewDstRC))
+ if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
return nullptr;
NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
@@ -5686,7 +5819,7 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
return MO.getReg();
// If this could be a VGPR or an SGPR, Check the dynamic register class.
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
if (RI.isSGPRClass(RegRC))
UsedSGPRs[i] = Reg;
@@ -5941,7 +6074,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
- unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
MachineInstr *SIIF =
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
.add(Branch->getOperand(0))
@@ -5968,8 +6101,8 @@ void SIInstrInfo::convertNonUniformLoopRegion(
if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
- unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC());
- unsigned BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
MachineInstrBuilder HeaderPHIBuilder =
BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
@@ -5979,7 +6112,7 @@ void SIInstrInfo::convertNonUniformLoopRegion(
HeaderPHIBuilder.addReg(BackEdgeReg);
} else {
MachineBasicBlock *PMBB = *PI;
- unsigned ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
ZeroReg, 0);
HeaderPHIBuilder.addReg(ZeroReg);
@@ -6063,13 +6196,30 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- unsigned UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
+ Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
.addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
+MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register DestReg,
+ RegScavenger &RS) const {
+ if (ST.hasAddNoCarry())
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
+
+ Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
+ // TODO: Users need to deal with this.
+ if (!UnusedCarry.isValid())
+ return MachineInstrBuilder();
+
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
+ .addReg(UnusedCarry, RegState::Define | RegState::Dead);
+}
+
bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
switch (Opcode) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
@@ -6115,7 +6265,21 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
return false;
const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
- return RCID == AMDGPU::SReg_128RegClassID;
+ return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
+}
+
+unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace,
+ bool Signed) const {
+ if (!ST.hasFlatInstOffsets())
+ return 0;
+
+ if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
+ return 0;
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
+ return Signed ? 12 : 11;
+
+ return Signed ? 13 : 12;
}
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
@@ -6254,7 +6418,7 @@ static bool followSubRegDef(MachineInstr &MI,
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
- if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
+ if (!Register::isVirtualRegister(P.Reg))
return nullptr;
auto RSR = P;
@@ -6265,8 +6429,7 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
case AMDGPU::COPY:
case AMDGPU::V_MOV_B32_e32: {
auto &Op1 = MI->getOperand(1);
- if (Op1.isReg() &&
- TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
+ if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) {
if (Op1.isUndef())
return nullptr;
RSR = getRegSubRegPair(Op1);
@@ -6360,3 +6523,40 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
return true;
}
}
+
+MachineInstr *SIInstrInfo::createPHIDestinationCopy(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
+ const DebugLoc &DL, Register Src, Register Dst) const {
+ auto Cur = MBB.begin();
+ if (Cur != MBB.end())
+ do {
+ if (!Cur->isPHI() && Cur->readsRegister(Dst))
+ return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
+ ++Cur;
+ } while (Cur != MBB.end() && Cur != LastPHIIt);
+
+ return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
+ Dst);
+}
+
+MachineInstr *SIInstrInfo::createPHISourceCopy(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
+ const DebugLoc &DL, Register Src, Register SrcSubReg, Register Dst) const {
+ if (InsPt != MBB.end() &&
+ (InsPt->getOpcode() == AMDGPU::SI_IF ||
+ InsPt->getOpcode() == AMDGPU::SI_ELSE ||
+ InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
+ InsPt->definesRegister(Src)) {
+ InsPt++;
+ return BuildMI(MBB, InsPt, InsPt->getDebugLoc(),
+ get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
+ : AMDGPU::S_MOV_B64_term),
+ Dst)
+ .addReg(Src, 0, SrcSubReg)
+ .addReg(AMDGPU::EXEC, RegState::Implicit);
+ }
+ return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
+ Dst);
+}
+
+bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
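Note on the SIInstrInfo.cpp hunks above: a recurring mechanical change is the move from the static TargetRegisterInfo::isVirtualRegister / isPhysicalRegister helpers (and plain `unsigned` register values) to the Register class. The sketch below is illustrative only, not part of the patch; it restates the lookup pattern the hunks apply, assuming the llvm::Register API of this LLVM revision and the usual MachineRegisterInfo / SIRegisterInfo interfaces.

#include "llvm/CodeGen/Register.h"

// Illustrative: virtual registers record their class in MRI, physical
// registers are classified by the target register info. This mirrors the
// rewritten call sites such as isLegalRegOperand above.
static const TargetRegisterClass *
lookupRegClass(Register Reg, const MachineRegisterInfo &MRI,
               const SIRegisterInfo &RI) {
  return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
                                          : RI.getPhysRegClass(Reg);
}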
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 3ff35da0b963..be463442c888 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -173,7 +173,7 @@ public:
}
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const override;
+ AAResults *AA) const override;
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
int64_t &Offset1,
@@ -229,6 +229,14 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
+ // instructions. Returns a pair of generated instructions.
+ // Can split either post-RA with physical registers or pre-RA with
+ // virtual registers. In the latter case the IR needs to be in SSA form
+ // and a REG_SEQUENCE is produced to define the original register.
+ std::pair<MachineInstr*, MachineInstr*>
+ expandMovDPP64(MachineInstr &MI) const;
+
// Returns an opcode that can be used to move a value to a \p DstRC
// register. If there is no hardware instruction that can store to \p
// DstRC, then AMDGPU::COPY is returned.
@@ -242,7 +250,7 @@ public:
return commuteOpcode(MI.getOpcode());
}
- bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
bool findCommutedOpIndices(MCInstrDesc Desc, unsigned & SrcOpIdx0,
@@ -303,8 +311,7 @@ public:
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA = nullptr) const override;
+ const MachineInstr &MIb) const override;
bool isFoldableCopy(const MachineInstr &MI) const;
@@ -578,6 +585,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::IsMAI;
}
+ static bool isDOT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
+ }
+
+ bool isDOT(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
+ }
+
static bool isScalarUnit(const MachineInstr &MI) {
return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
}
@@ -954,6 +969,19 @@ public:
bool isBasicBlockPrologue(const MachineInstr &MI) const override;
+ MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsPt,
+ const DebugLoc &DL, Register Src,
+ Register Dst) const override;
+
+ MachineInstr *createPHISourceCopy(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsPt,
+ const DebugLoc &DL, Register Src,
+ Register SrcSubReg,
+ Register Dst) const override;
+
+ bool isWave32() const;
+
/// Return a partially built integer add instruction without carry.
/// Caller must add source operands.
/// For pre-GFX9 it will generate unused carry destination operand.
@@ -963,6 +991,12 @@ public:
const DebugLoc &DL,
unsigned DestReg) const;
+ MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register DestReg,
+ RegScavenger &RS) const;
+
static bool isKillTerminator(unsigned Opcode);
const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
@@ -970,6 +1004,8 @@ public:
return isUInt<12>(Imm);
}
+ unsigned getNumFlatOffsetBits(unsigned AddrSpace, bool Signed) const;
+
/// Returns if \p Offset is legal for the subtarget as the offset to a FLAT
/// encoded instruction. If \p Signed, this is for an instruction that
/// interprets the offset as signed.
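The new getNumFlatOffsetBits hook declared above returns the immediate-offset width for FLAT-family instructions: 0 when offsets are unsupported or disabled by the GFX10 flat segment-offset bug, 12 signed / 11 unsigned bits on GFX10, and 13 / 12 before that. A minimal sketch of how a caller could combine it with a range check follows; fitsFlatOffset is a hypothetical helper, not an API added by this patch, and it assumes the isIntN/isUIntN utilities from llvm/Support/MathExtras.h.

#include "llvm/Support/MathExtras.h"

// Hypothetical helper (not part of the patch): pre-check whether an
// immediate offset can be encoded, using the width reported by
// getNumFlatOffsetBits().
static bool fitsFlatOffset(const SIInstrInfo &TII, int64_t Offset,
                           unsigned AddrSpace, bool Signed) {
  unsigned NumBits = TII.getNumFlatOffsetBits(AddrSpace, Signed);
  if (NumBits == 0)
    return Offset == 0; // No immediate offset field is available.
  return Signed ? llvm::isIntN(NumBits, Offset)
                : llvm::isUIntN(NumBits, Offset);
}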
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index c382c816e0b4..1eecbf555613 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -84,7 +84,7 @@ def SDTtbuffer_load : SDTypeProfile<1, 8,
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // format(imm)
- SDTCisVT<7, i32>, // cachecontrol(imm)
+ SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm)
SDTCisVT<8, i1> // idxen(imm)
]>;
@@ -102,7 +102,7 @@ def SDTtbuffer_store : SDTypeProfile<0, 9,
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // format(imm)
- SDTCisVT<7, i32>, // cachecontrol(imm)
+ SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm)
SDTCisVT<8, i1> // idxen(imm)
]>;
@@ -119,7 +119,7 @@ def SDTBufferLoad : SDTypeProfile<1, 7,
SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // cachepolicy(imm)
+ SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm)
SDTCisVT<7, i1>]>; // idxen(imm)
def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
@@ -145,7 +145,7 @@ def SDTBufferStore : SDTypeProfile<0, 8,
SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // cachepolicy(imm)
+ SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm)
SDTCisVT<7, i1>]>; // idxen(imm)
def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
@@ -198,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">;
def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">;
def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
+def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
+def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>;
def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>;
@@ -264,6 +266,11 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
+def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE",
+ SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]
+>;
+
//===----------------------------------------------------------------------===//
// ValueType helpers
//===----------------------------------------------------------------------===//
@@ -277,7 +284,9 @@ class isFloatType<ValueType SrcVT> {
!if(!eq(SrcVT.Value, f64.Value), 1,
!if(!eq(SrcVT.Value, v2f16.Value), 1,
!if(!eq(SrcVT.Value, v4f16.Value), 1,
- 0)))));
+ !if(!eq(SrcVT.Value, v2f32.Value), 1,
+ !if(!eq(SrcVT.Value, v2f64.Value), 1,
+ 0)))))));
}
class isIntType<ValueType SrcVT> {
@@ -300,14 +309,36 @@ class isPackedType<ValueType SrcVT> {
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
-defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
-defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
+foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
+
-def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
-def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
-def atomic_load_fadd_local : local_binary_atomic_op<atomic_load_fadd>;
-def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
-def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
+defm atomic_inc_#as : binary_atomic_op<SIatomic_inc>;
+defm atomic_dec_#as : binary_atomic_op<SIatomic_dec>;
+defm atomic_load_fmin_#as : binary_atomic_op<SIatomic_fmin, 0>;
+defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>;
+
+
+} // End let AddressSpaces = ...
+} // End foreach AddrSpace
+
+def atomic_fadd_global_noret : PatFrag<
+ (ops node:$ptr, node:$value),
+ (SIglobal_atomic_fadd node:$ptr, node:$value)> {
+ // FIXME: Move this
+ let MemoryVT = f32;
+ let IsAtomic = 1;
+ let AddressSpaces = StoreAddress_global.AddrSpaces;
+}
+
+def atomic_pk_fadd_global_noret : PatFrag<
+ (ops node:$ptr, node:$value),
+ (SIglobal_atomic_pk_fadd node:$ptr, node:$value)> {
+ // FIXME: Move this
+ let MemoryVT = v2f16;
+ let IsAtomic = 1;
+ let AddressSpaces = StoreAddress_global.AddrSpaces;
+}
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
@@ -328,10 +359,12 @@ def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad,
>;
def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr)> {
+ let IsLoad = 1;
let IsUnindexed = 1;
}
def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> {
+ let IsLoad = 1;
let IsNonExtLoad = 1;
}
@@ -347,14 +380,15 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr),
let MemoryVT = i64;
}
-def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> {
+def extload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> {
let IsLoad = 1;
let IsAnyExtLoad = 1;
}
-def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
-}]>;
+def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let IsSignExtLoad = 1;
+}
def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> {
let IsLoad = 1;
@@ -391,25 +425,50 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> {
let MemoryVT = i16;
}
-def load_glue_align8 : Aligned8Bytes <
- (ops node:$ptr), (load_glue node:$ptr)
->;
-def load_glue_align16 : Aligned16Bytes <
- (ops node:$ptr), (load_glue node:$ptr)
->;
+let IsLoad = 1, AddressSpaces = LoadAddress_local.AddrSpaces in {
+def load_local_m0 : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> {
+ let IsNonExtLoad = 1;
+}
-def load_local_m0 : LoadFrag<load_glue>, LocalAddress;
-def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress;
-def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress;
-def extloadi8_local_m0 : LoadFrag<extloadi8_glue>, LocalAddress;
-def zextloadi8_local_m0 : LoadFrag<zextloadi8_glue>, LocalAddress;
-def extloadi16_local_m0 : LoadFrag<extloadi16_glue>, LocalAddress;
-def zextloadi16_local_m0 : LoadFrag<zextloadi16_glue>, LocalAddress;
-def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
-def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress;
-def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress;
-def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress;
+let MemoryVT = i8 in {
+def extloadi8_local_m0 : PatFrag<(ops node:$ptr), (extloadi8_glue node:$ptr)>;
+def sextloadi8_local_m0 : PatFrag<(ops node:$ptr), (sextloadi8_glue node:$ptr)>;
+def zextloadi8_local_m0 : PatFrag<(ops node:$ptr), (zextloadi8_glue node:$ptr)>;
+}
+
+let MemoryVT = i16 in {
+def extloadi16_local_m0 : PatFrag<(ops node:$ptr), (extloadi16_glue node:$ptr)>;
+def sextloadi16_local_m0 : PatFrag<(ops node:$ptr), (sextloadi16_glue node:$ptr)>;
+def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)>;
+}
+
+def load_align8_local_m0 : PatFrag<(ops node:$ptr),
+ (load_local_m0 node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+ let MinAlignment = 8;
+}
+def load_align16_local_m0 : PatFrag<(ops node:$ptr),
+ (load_local_m0 node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+ let MinAlignment = 16;
+}
+
+} // End IsLoad = 1
+
+let IsAtomic = 1, AddressSpaces = LoadAddress_local.AddrSpaces in {
+def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr),
+ (atomic_load_32_glue node:$ptr)> {
+ let MemoryVT = i32;
+}
+def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr),
+ (atomic_load_64_glue node:$ptr)> {
+ let MemoryVT = i64;
+}
+
+} // End let AddressSpaces = LoadAddress_local.AddrSpaces
def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore,
@@ -420,50 +479,88 @@ def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
>;
-def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val),
- (AMDGPUatomic_st_glue node:$ptr, node:$val)> {
-}
-
def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr),
- (AMDGPUst_glue node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
-}]>;
+ (AMDGPUst_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsUnindexed = 1;
+}
def store_glue : PatFrag<(ops node:$val, node:$ptr),
- (unindexedstore_glue node:$val, node:$ptr), [{
- return !cast<StoreSDNode>(N)->isTruncatingStore();
-}]>;
+ (unindexedstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
def truncstore_glue : PatFrag<(ops node:$val, node:$ptr),
- (unindexedstore_glue node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->isTruncatingStore();
-}]>;
+ (unindexedstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 1;
+}
def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr),
- (truncstore_glue node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+ (truncstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i8;
+}
def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr),
- (truncstore_glue node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+ (truncstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i16;
+}
-def store_glue_align8 : Aligned8Bytes <
- (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
->;
+let IsStore = 1, AddressSpaces = StoreAddress_local.AddrSpaces in {
+def store_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (store_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
-def store_glue_align16 : Aligned16Bytes <
- (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
->;
+def truncstorei8_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i8;
+}
+
+def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i16;
+}
+}
+
+def store_align16_local_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (store_local_m0 node:$value, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+ let MinAlignment = 16;
+}
-def store_local_m0 : StoreFrag<store_glue>, LocalAddress;
-def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress;
-def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress;
-def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress;
+def store_align8_local_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (store_local_m0 node:$value, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+ let MinAlignment = 8;
+}
+
+let AddressSpaces = StoreAddress_local.AddrSpaces in {
+
+def atomic_store_local_32_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i32;
+}
+def atomic_store_local_64_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i64;
+}
+} // End let AddressSpaces = StoreAddress_local.AddrSpaces
-def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress;
-def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress;
def si_setcc_uniform : PatFrag <
(ops node:$lhs, node:$rhs, node:$cond),
@@ -539,16 +636,27 @@ def lshl_rev : PatFrag <
(shl $src0, $src1)
>;
+def add_ctpop : PatFrag <
+ (ops node:$src0, node:$src1),
+ (add (ctpop $src0), $src1)
+>;
+
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
- SDTypeProfile tc = SDTAtomic2> {
+ SDTypeProfile tc = SDTAtomic2,
+ bit IsInt = 1> {
def _glue : SDNode <
!if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
- def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
- def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ let AddressSpaces = StoreAddress_local.AddrSpaces in {
+ defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
+ }
+
+ let AddressSpaces = StoreAddress_region.AddrSpaces in {
+ defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
+ }
}
defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
@@ -563,17 +671,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
-defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32>;
-defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
-defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
-
-def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
->;
-
-def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>;
-def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion<atomic_cmp_swap_glue>;
-
+defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
def as_i1imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
@@ -591,6 +691,10 @@ def as_i32imm: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
}]>;
+def as_i32timm: SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
def as_i64imm: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64);
}]>;
@@ -627,9 +731,13 @@ def SIMM16bit : ImmLeaf <i32,
>;
def UIMM16bit : ImmLeaf <i32,
- [{return isUInt<16>(Imm); }]
+ [{return isUInt<16>(Imm);}]
>;
+def i64imm_32bit : ImmLeaf<i64, [{
+ return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
+}]>;
+
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
return isInlineImmediate(N);
}]>;
@@ -763,6 +871,18 @@ def ExpTgtMatchClass : AsmOperandClass {
let RenderMethod = "printExpTgt";
}
+def SWaitMatchClass : AsmOperandClass {
+ let Name = "SWaitCnt";
+ let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseSWaitCntOps";
+}
+
+def VReg32OrOffClass : AsmOperandClass {
+ let Name = "VReg32OrOff";
+ let ParserMethod = "parseVReg32OrOff";
+}
+
+let OperandType = "OPERAND_IMMEDIATE" in {
def SendMsgImm : Operand<i32> {
let PrintMethod = "printSendMsg";
let ParserMatchClass = SendMsgMatchClass;
@@ -778,22 +898,11 @@ def EndpgmImm : Operand<i16> {
let ParserMatchClass = EndpgmMatchClass;
}
-def SWaitMatchClass : AsmOperandClass {
- let Name = "SWaitCnt";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "parseSWaitCntOps";
-}
-
-def VReg32OrOffClass : AsmOperandClass {
- let Name = "VReg32OrOff";
- let ParserMethod = "parseVReg32OrOff";
-}
-
def WAIT_FLAG : Operand <i32> {
let ParserMatchClass = SWaitMatchClass;
let PrintMethod = "printWaitFlag";
- let OperandType = "OPERAND_IMMEDIATE";
}
+} // End OperandType = "OPERAND_IMMEDIATE"
include "SIInstrFormats.td"
include "VIInstrFormats.td"
@@ -929,6 +1038,7 @@ def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>;
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>;
def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>;
@@ -1317,18 +1427,6 @@ class getVALUDstForVT<ValueType VT> {
VOPDstS64orS32)))); // else VT == i1
}
-// Returns true if VT is floating point.
-class getIsFP<ValueType VT> {
- bit ret = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, v2f16.Value), 1,
- !if(!eq(VT.Value, v4f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, v2f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- !if(!eq(VT.Value, v2f64.Value), 1,
- 0)))))));
-}
-
// Returns the register class to use for the destination of VOP[12C]
// instructions with SDWA extension
class getSDWADstForVT<ValueType VT> {
@@ -1340,7 +1438,7 @@ class getSDWADstForVT<ValueType VT> {
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
RegisterOperand ret =
!if(isFP,
@@ -1373,11 +1471,14 @@ class getVOPSrc0ForVT<ValueType VT> {
// Returns the vreg register class to use for source operand given VT
class getVregSrcForVT<ValueType VT> {
RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
- !if(!eq(VT.Size, 64), VReg_64, VGPR_32));
+ !if(!eq(VT.Size, 96), VReg_96,
+ !if(!eq(VT.Size, 64), VReg_64,
+ !if(!eq(VT.Size, 48), VReg_64,
+ VGPR_32))));
}
class getSDWASrcForVT <ValueType VT> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32);
RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32);
RegisterOperand ret = !if(isFP, retFlt, retInt);
@@ -1386,7 +1487,7 @@ class getSDWASrcForVT <ValueType VT> {
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
RegisterOperand ret =
!if(!eq(VT.Size, 128),
VSrc_128,
@@ -1433,7 +1534,7 @@ class isModifierType<ValueType SrcVT> {
// Return type of input modifiers operand for specified input operand
class getSrcMod <ValueType VT, bit EnableF32SrcMods> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
bit isPacked = isPackedType<VT>.ret;
Operand ret = !if(!eq(VT.Size, 64),
!if(isFP, FP64InputMods, Int64InputMods),
@@ -1452,7 +1553,7 @@ class getOpSelMod <ValueType VT> {
// Return type of input modifiers operand specified input operand for DPP
class getSrcModExt <ValueType VT> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
@@ -2038,6 +2139,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field int NeedPatGen = PatGenMode.NoPattern;
field bit IsMAI = 0;
+ field bit IsDOT = 0;
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 70f20bb69370..21984c6ad910 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -43,8 +43,8 @@ multiclass V_INTERP_P1_F32_m : VINTRP_m <
(outs VINTRPDst:$vdst),
(ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
- [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
- (i32 imm:$attr)))]
+ [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
+ (i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;
let OtherPredicates = [has32BankLDS] in {
@@ -66,8 +66,8 @@ defm V_INTERP_P2_F32 : VINTRP_m <
(outs VINTRPDst:$vdst),
(ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
- [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
- (i32 imm:$attr)))]>;
+ [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
+ (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
@@ -76,8 +76,8 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
(outs VINTRPDst:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
- [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
- (i32 imm:$attr)))]>;
+ [(set f32:$vdst, (int_amdgcn_interp_mov (i32 imm:$vsrc),
+ (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
} // End Uses = [M0, EXEC]
@@ -92,6 +92,11 @@ def ATOMIC_FENCE : SPseudoInstSI<
let maybeAtomic = 1;
}
+def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
+ let HasExt = 1;
+ let HasExtDPP = 1;
+}
+
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
@@ -107,10 +112,19 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
+// 64-bit vector move with dpp. Expanded post-RA.
+def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
+ let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
+}
+
// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
+// turned into a copy by WQM pass, but does not seed WQM requirements.
+def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+
// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
// the instruction that defines $src0 (which is run in WWM) doesn't
@@ -345,13 +359,15 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
}
def SI_INIT_EXEC : SPseudoInstSI <
- (outs), (ins i64imm:$src), []> {
+ (outs), (ins i64imm:$src),
+ [(int_amdgcn_init_exec (i64 timm:$src))]> {
let Defs = [EXEC];
let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
let WaveSizePredicate = isWave64;
}
+// FIXME: Intrinsic should be mangled for wave size.
def SI_INIT_EXEC_LO : SPseudoInstSI <
(outs), (ins i32imm:$src), []> {
let Defs = [EXEC_LO];
@@ -360,12 +376,20 @@ def SI_INIT_EXEC_LO : SPseudoInstSI <
let WaveSizePredicate = isWave32;
}
+// FIXME: Wave32 version
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
- (outs), (ins SSrc_b32:$input, i32imm:$shift), []> {
+ (outs), (ins SSrc_b32:$input, i32imm:$shift),
+ [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
let Defs = [EXEC];
let usesCustomInserter = 1;
}
+def : GCNPat <
+ (int_amdgcn_init_exec timm:$src),
+ (SI_INIT_EXEC_LO (as_i32imm imm:$src))> {
+ let WaveSizePredicate = isWave32;
+}
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -604,25 +628,6 @@ def : GCNPat <
(SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;
-def : GCNPat <
- (AMDGPUinit_exec i64:$src),
- (SI_INIT_EXEC (as_i64imm $src))
-> {
- let WaveSizePredicate = isWave64;
-}
-
-def : GCNPat <
- (AMDGPUinit_exec i64:$src),
- (SI_INIT_EXEC_LO (as_i32imm $src))
-> {
- let WaveSizePredicate = isWave32;
-}
-
-def : GCNPat <
- (AMDGPUinit_exec_from_input i32:$input, i32:$shift),
- (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))
->;
-
def : GCNPat<
(AMDGPUtrap timm:$trapid),
(S_TRAP $trapid)
@@ -740,22 +745,22 @@ def : GCNPat <
def : GCNPat <
(i32 (fp_to_sint f16:$src)),
- (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
+ (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src))
>;
def : GCNPat <
(i32 (fp_to_uint f16:$src)),
- (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
+ (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src))
>;
def : GCNPat <
(f16 (sint_to_fp i32:$src)),
- (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
+ (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 VSrc_b32:$src))
>;
def : GCNPat <
(f16 (uint_to_fp i32:$src)),
- (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
+ (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src))
>;
//===----------------------------------------------------------------------===//
@@ -808,8 +813,14 @@ def : GCNPat <
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
}
+
def : GCNPat <
- (i16 (add (i16 (trunc (getDivergentFrag<ctpop>.ret i32:$popcnt))), i16:$val)),
+ (i32 (ctpop i32:$popcnt)),
+ (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
+>;
+
+def : GCNPat <
+ (i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
@@ -1076,53 +1087,158 @@ def : GCNPat <
/********** ================================ **********/
// Prevent expanding both fneg and fabs.
+// TODO: Add IgnoredBySelectionDAG bit?
+let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG
def : GCNPat <
- (fneg (fabs f32:$src)),
- (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
+ (fneg (fabs (f32 SReg_32:$src))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit
>;
-// FIXME: Should use S_OR_B32
def : GCNPat <
- (fneg (fabs f64:$src)),
- (REG_SEQUENCE VReg_64,
- (i32 (EXTRACT_SUBREG f64:$src, sub0)),
- sub0,
- (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
- (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
- sub1)
+ (fabs (f32 SReg_32:$src)),
+ (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff)))
+>;
+
+def : GCNPat <
+ (fneg (f32 SReg_32:$src)),
+ (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
+>;
+
+def : GCNPat <
+ (fneg (f16 SReg_32:$src)),
+ (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
+>;
+
+def : GCNPat <
+ (fneg (f16 VGPR_32:$src)),
+ (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fabs (f16 SReg_32:$src)),
+ (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
+>;
+
+def : GCNPat <
+ (fneg (fabs (f16 SReg_32:$src))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
+>;
+
+def : GCNPat <
+ (fneg (fabs (f16 VGPR_32:$src))),
+ (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
+>;
+
+def : GCNPat <
+ (fneg (v2f16 SReg_32:$src)),
+ (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000)))
+>;
+
+def : GCNPat <
+ (fabs (v2f16 SReg_32:$src)),
+ (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff)))
+>;
+
+// This is really (fneg (fabs v2f16:$src))
+//
+// fabs is not reported as free because there is a modifier for it in
+// VOP3P instructions, so it is turned into the bit op.
+def : GCNPat <
+ (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
def : GCNPat <
- (fabs f32:$src),
- (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff)))
+ (fneg (v2f16 (fabs SReg_32:$src))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
+>;
+
+// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled
+// def : GCNPat <
+// (fneg (f64 SReg_64:$src)),
+// (REG_SEQUENCE SReg_64,
+// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
+// sub0,
+// (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
+// (i32 (S_MOV_B32 (i32 0x80000000)))),
+// sub1)
+// >;
+
+// def : GCNPat <
+// (fneg (fabs (f64 SReg_64:$src))),
+// (REG_SEQUENCE SReg_64,
+// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
+// sub0,
+// (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
+// (S_MOV_B32 (i32 0x80000000))), // Set sign bit.
+// sub1)
+// >;
+
+} // End let AddedComplexity = 1
+
+def : GCNPat <
+ (fabs (f32 VGPR_32:$src)),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fneg (f32 VGPR_32:$src)),
+ (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fabs (f16 VGPR_32:$src)),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
>;
def : GCNPat <
- (fneg f32:$src),
- (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
+ (fneg (v2f16 VGPR_32:$src)),
+ (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;
def : GCNPat <
- (fabs f64:$src),
+ (fabs (v2f16 VGPR_32:$src)),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fneg (v2f16 (fabs VGPR_32:$src))),
+ (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit
+>;
+
+def : GCNPat <
+ (fabs (f64 VReg_64:$src)),
(REG_SEQUENCE VReg_64,
- (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
sub0,
- (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
(V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit.
sub1)
>;
+// TODO: Use SGPR for constant
def : GCNPat <
- (fneg f64:$src),
+ (fneg (f64 VReg_64:$src)),
(REG_SEQUENCE VReg_64,
- (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
sub0,
- (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
(i32 (V_MOV_B32_e32 (i32 0x80000000)))),
sub1)
>;
+// TODO: Use SGPR for constant
+def : GCNPat <
+ (fneg (fabs (f64 VReg_64:$src))),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
+ sub0,
+ (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
+ (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
+ sub1)
+>;
+
def : GCNPat <
(fcopysign f16:$src0, f16:$src1),
(V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -1154,45 +1270,6 @@ def : GCNPat <
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
-def : GCNPat <
- (fneg f16:$src),
- (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000)))
->;
-
-def : GCNPat <
- (fabs f16:$src),
- (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff)))
->;
-
-def : GCNPat <
- (fneg (fabs f16:$src)),
- (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
->;
-
-def : GCNPat <
- (fneg v2f16:$src),
- (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000)))
->;
-
-def : GCNPat <
- (fabs v2f16:$src),
- (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff)))
->;
-
-// This is really (fneg (fabs v2f16:$src))
-//
-// fabs is not reported as free because there is modifier for it in
-// VOP3P instructions, so it is turned into the bit op.
-def : GCNPat <
- (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
- (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
->;
-
-def : GCNPat <
- (fneg (v2f16 (fabs v2f16:$src))),
- (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
->;
-
/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/
@@ -1544,7 +1621,7 @@ def : GCNPat <
(V_CVT_F16_F32_e32 (
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
- $src))
+ SSrc_i1:$src))
>;
def : GCNPat <
@@ -1552,35 +1629,35 @@ def : GCNPat <
(V_CVT_F16_F32_e32 (
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
- $src))
+ SSrc_i1:$src))
>;
def : GCNPat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
- $src)
+ SSrc_i1:$src)
>;
def : GCNPat <
(f32 (uint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
- $src)
+ SSrc_i1:$src)
>;
def : GCNPat <
(f64 (sint_to_fp i1:$src)),
(V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 -1),
- $src))
+ SSrc_i1:$src))
>;
def : GCNPat <
(f64 (uint_to_fp i1:$src)),
(V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 1),
- $src))
+ SSrc_i1:$src))
>;
//===----------------------------------------------------------------------===//
@@ -1788,6 +1865,22 @@ def : GCNPat <
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;
+def : GCNPat <
+ (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
+ timm:$bound_ctrl)),
+ (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl))
+>;
+
+def : GCNPat <
+ (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
+ timm:$bank_mask, timm:$bound_ctrl)),
+ (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl))
+>;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
@@ -1915,3 +2008,13 @@ def : FP16Med3Pat<f16, V_MED3_F16>;
defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
} // End Predicates = [isGFX9Plus]
+
+class AMDGPUGenericInstruction : GenericInstruction {
+ let Namespace = "AMDGPU";
+}
+
+def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = 0;
+}
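
For reference, the SALU/VALU patterns above all reduce fneg/fabs to plain integer operations on the IEEE-754 sign bit; the f16 and v2f16 patterns use the analogous 0x8000/0x7fff and 0x80008000/0x7fff7fff masks. A minimal standalone C++ sketch of the f32 case (illustrative only, not part of the patch; helper names are hypothetical):

#include <cstdint>
#include <cstdio>
#include <cstring>

// memcpy is the portable way to move between a float value and its bit pattern.
static uint32_t bitsOf(float F) { uint32_t U; std::memcpy(&U, &F, sizeof(U)); return U; }
static float floatOf(uint32_t U) { float F; std::memcpy(&F, &U, sizeof(F)); return F; }

int main() {
  float X = 1.5f;
  float Neg    = floatOf(bitsOf(X)  ^ 0x80000000u); // fneg: flip sign bit (S_XOR_B32 / V_XOR_B32)
  float Abs    = floatOf(bitsOf(-X) & 0x7fffffffu); // fabs: clear sign bit (S_AND_B32 / V_AND_B32)
  float NegAbs = floatOf(bitsOf(X)  | 0x80000000u); // fneg(fabs): set sign bit (S_OR_B32 / V_OR_B32)
  std::printf("%g %g %g\n", Neg, Abs, NegAbs);      // prints: -1.5 1.5 -1.5
  return 0;
}
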
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ae8b967893a2..20db1c37f354 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -42,10 +42,7 @@
//
// Future improvements:
//
-// - This currently relies on the scheduler to place loads and stores next to
-// each other, and then only merges adjacent pairs of instructions. It would
-// be good to be more flexible with interleaved instructions, and possibly run
-// before scheduling. It currently missing stores of constants because loading
+// - This is currently missing stores of constants because loading
// the constant into the data register is placed between the stores, although
// this is arguably a scheduling problem.
//
@@ -98,14 +95,9 @@ enum InstClassEnum {
DS_READ,
DS_WRITE,
S_BUFFER_LOAD_IMM,
- BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
- BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
- BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
- BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
- BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
- BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
- BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
- BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
+ BUFFER_LOAD,
+ BUFFER_STORE,
+ MIMG,
};
enum RegisterEnum {
@@ -114,6 +106,7 @@ enum RegisterEnum {
SOFFSET = 0x4,
VADDR = 0x8,
ADDR = 0x10,
+ SSAMP = 0x20,
};
class SILoadStoreOptimizer : public MachineFunctionPass {
@@ -126,6 +119,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
unsigned Width0;
unsigned Width1;
unsigned BaseOff;
+ unsigned DMask0;
+ unsigned DMask1;
InstClassEnum InstClass;
bool GLC0;
bool GLC1;
@@ -135,6 +130,60 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool DLC1;
bool UseST64;
SmallVector<MachineInstr *, 8> InstsToMove;
+ int AddrIdx[5];
+ const MachineOperand *AddrReg[5];
+ unsigned NumAddresses;
+
+ bool hasSameBaseAddress(const MachineInstr &MI) {
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
+
+ if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
+ if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
+ AddrReg[i]->getImm() != AddrRegNext.getImm()) {
+ return false;
+ }
+ continue;
+ }
+
+ // Check same base pointer. Be careful of subregisters, which can occur
+ // with vectors of pointers.
+ if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
+ AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
+ for (unsigned i = 0; i < NumAddresses; ++i) {
+ const MachineOperand *AddrOp = AddrReg[i];
+ // Immediates are always OK.
+ if (AddrOp->isImm())
+ continue;
+
+ // Don't try to merge addresses that aren't either immediates or registers.
+ // TODO: Should be possible to merge FrameIndexes and maybe some other
+ // non-register operands.
+ if (!AddrOp->isReg())
+ return false;
+
+ // TODO: We should be able to merge physical reg addresses.
+ if (Register::isPhysicalRegister(AddrOp->getReg()))
+ return false;
+
+ // If an address has only one use then there will be no other
+ // instructions with the same address, so we can't merge this one.
+ if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
+ return false;
+ }
+ return true;
+ }
+
+ void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
+ const GCNSubtarget &STM);
+ void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
};
struct BaseRegisters {
@@ -160,14 +209,12 @@ private:
AliasAnalysis *AA = nullptr;
bool OptimizeAgain;
+ static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);
static bool offsetsCanBeCombined(CombineInfo &CI);
static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
static unsigned getNewOpcode(const CombineInfo &CI);
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
- unsigned getOpcodeWidth(const MachineInstr &MI);
- InstClassEnum getInstClass(unsigned Opc);
- unsigned getRegs(unsigned Opc);
bool findMatchingInst(CombineInfo &CI);
@@ -178,22 +225,27 @@ private:
unsigned write2Opcode(unsigned EltSize) const;
unsigned write2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI);
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
- int32_t NewOffset);
- unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
- MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
- Optional<int32_t> extractConstOffset(const MachineOperand &Op);
- void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
+ int32_t NewOffset) const;
+ unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
+ MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
+ Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
+ void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
/// Promotes constant offset to the immediate by adjusting the base. It
/// tries to use a base from the nearby instructions that allows it to have
/// a 13bit constant offset which gets promoted to the immediate.
bool promoteConstantOffsetToImm(MachineInstr &CI,
MemInfoMap &Visited,
- SmallPtrSet<MachineInstr *, 4> &Promoted);
+ SmallPtrSet<MachineInstr *, 4> &Promoted) const;
+ void addInstToMergeableList(const CombineInfo &CI,
+ std::list<std::list<CombineInfo> > &MergeableInsts) const;
+ bool collectMergeableInsts(MachineBasicBlock &MBB,
+ std::list<std::list<CombineInfo> > &MergeableInsts) const;
public:
static char ID;
@@ -202,7 +254,11 @@ public:
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
}
- bool optimizeBlock(MachineBasicBlock &MBB);
+ void removeCombinedInst(std::list<CombineInfo> &MergeList,
+ const MachineInstr &MI);
+ bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
+ bool &OptimizeListAgain);
+ bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -216,6 +272,264 @@ public:
}
};
+static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
+ const unsigned Opc = MI.getOpcode();
+
+ if (TII.isMUBUF(Opc)) {
+ // FIXME: Handle d16 correctly
+ return AMDGPU::getMUBUFElements(Opc);
+ }
+ if (TII.isMIMG(MI)) {
+ uint64_t DMaskImm =
+ TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
+ return countPopulation(DMaskImm);
+ }
+
+ switch (Opc) {
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ return 1;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ return 2;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return 4;
+ default:
+ return 0;
+ }
+}
+
+/// Maps instruction opcode to enum InstClassEnum.
+static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
+ switch (Opc) {
+ default:
+ if (TII.isMUBUF(Opc)) {
+ switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
+ default:
+ return UNKNOWN;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+ return BUFFER_LOAD;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ return BUFFER_STORE;
+ }
+ }
+ if (TII.isMIMG(Opc)) {
+ // Ignore instructions encoded without vaddr.
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
+ return UNKNOWN;
+ // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
+ if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc))
+ return UNKNOWN;
+ return MIMG;
+ }
+ return UNKNOWN;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return S_BUFFER_LOAD_IMM;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B64_gfx9:
+ return DS_READ;
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return DS_WRITE;
+ }
+}
+
+/// Determines instruction subclass from opcode. Only instructions
+/// of the same subclass can be merged together.
+static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
+ switch (Opc) {
+ default:
+ if (TII.isMUBUF(Opc))
+ return AMDGPU::getMUBUFBaseOpcode(Opc);
+ if (TII.isMIMG(Opc)) {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+ assert(Info);
+ return Info->BaseOpcode;
+ }
+ return -1;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B64_gfx9:
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return Opc;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
+ }
+}
+
+static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
+ if (TII.isMUBUF(Opc)) {
+ unsigned result = 0;
+
+ if (AMDGPU::getMUBUFHasVAddr(Opc)) {
+ result |= VADDR;
+ }
+
+ if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
+ result |= SRSRC;
+ }
+
+ if (AMDGPU::getMUBUFHasSoffset(Opc)) {
+ result |= SOFFSET;
+ }
+
+ return result;
+ }
+
+ if (TII.isMIMG(Opc)) {
+ unsigned result = VADDR | SRSRC;
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+ if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
+ result |= SSAMP;
+ return result;
+ }
+
+ switch (Opc) {
+ default:
+ return 0;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return SBASE;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64_gfx9:
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return ADDR;
+ }
+}
+
+
+void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
+ const SIInstrInfo &TII,
+ const GCNSubtarget &STM) {
+ I = MI;
+ unsigned Opc = MI->getOpcode();
+ InstClass = getInstClass(Opc, TII);
+
+ if (InstClass == UNKNOWN)
+ return;
+
+ switch (InstClass) {
+ case DS_READ:
+ EltSize =
+ (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
+ : 4;
+ break;
+ case DS_WRITE:
+ EltSize =
+ (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
+ : 4;
+ break;
+ case S_BUFFER_LOAD_IMM:
+ EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
+ break;
+ default:
+ EltSize = 4;
+ break;
+ }
+
+ if (InstClass == MIMG) {
+ DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
+ } else {
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
+ Offset0 = I->getOperand(OffsetIdx).getImm();
+ }
+
+ Width0 = getOpcodeWidth(*I, TII);
+
+ if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
+ Offset0 &= 0xffff;
+ } else if (InstClass != MIMG) {
+ GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
+ if (InstClass != S_BUFFER_LOAD_IMM) {
+ SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
+ }
+ DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
+ }
+
+ unsigned AddrOpName[5] = {0};
+ NumAddresses = 0;
+ const unsigned Regs = getRegs(I->getOpcode(), TII);
+
+ if (Regs & ADDR) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
+ }
+
+ if (Regs & SBASE) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
+ }
+
+ if (Regs & SRSRC) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+ }
+
+ if (Regs & SOFFSET) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
+ }
+
+ if (Regs & VADDR) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
+ }
+
+ if (Regs & SSAMP) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
+ }
+
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
+ AddrReg[i] = &I->getOperand(AddrIdx[i]);
+ }
+
+ InstsToMove.clear();
+}
+
+void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
+ const SIInstrInfo &TII) {
+ Paired = MI;
+ assert(InstClass == getInstClass(Paired->getOpcode(), TII));
+
+ if (InstClass == MIMG) {
+ DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm();
+ } else {
+ int OffsetIdx =
+ AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
+ Offset1 = Paired->getOperand(OffsetIdx).getImm();
+ }
+
+ Width1 = getOpcodeWidth(*Paired, TII);
+ if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
+ Offset1 &= 0xffff;
+ } else if (InstClass != MIMG) {
+ GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
+ if (InstClass != S_BUFFER_LOAD_IMM) {
+ SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
+ }
+ DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
+ }
+}
+
+
} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
@@ -249,8 +563,7 @@ static void addDefsUsesToList(const MachineInstr &MI,
if (Op.isReg()) {
if (Op.isDef())
RegDefs.insert(Op.getReg());
- else if (Op.readsReg() &&
- TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+ else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
PhysRegUses.insert(Op.getReg());
}
}
@@ -282,7 +595,7 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
if (Use.isReg() &&
((Use.readsReg() && RegDefs.count(Use.getReg())) ||
(Use.isDef() && RegDefs.count(Use.getReg())) ||
- (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+ (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
PhysRegUses.count(Use.getReg())))) {
Insts.push_back(&MI);
addDefsUsesToList(MI, RegDefs, PhysRegUses);
@@ -307,7 +620,59 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
return true;
}
+// This function assumes that \p A and \p B are identical except for
+// size and offset, and they reference adjacent memory.
+static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
+ const MachineMemOperand *A,
+ const MachineMemOperand *B) {
+ unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
+ unsigned Size = A->getSize() + B->getSize();
+ // This function adds the offset parameter to the existing offset for A,
+ // so we pass 0 here as the offset and then manually set it to the correct
+ // value after the call.
+ MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
+ MMO->setOffset(MinOffset);
+ return MMO;
+}
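
As a quick illustration of the arithmetic in combineKnownAdjacentMMOs (a standalone sketch, not LLVM code): the merged operand starts at the lower of the two offsets and spans both sizes, e.g. two 4-byte accesses at offsets 16 and 20 become one 8-byte operand at offset 16.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Hypothetical stand-in for a memoperand: only offset and size matter here.
struct Access { uint64_t Offset; uint64_t Size; };

static Access combineAdjacent(const Access &A, const Access &B) {
  // Same rule as above: lower offset, summed size.
  return {std::min(A.Offset, B.Offset), A.Size + B.Size};
}

int main() {
  Access Merged = combineAdjacent({16, 4}, {20, 4}); // two adjacent dword accesses
  assert(Merged.Offset == 16 && Merged.Size == 8);
  return 0;
}
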
+
+bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) {
+ assert(CI.InstClass == MIMG);
+
+ // Ignore instructions with tfe/lwe set.
+ const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
+ const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
+
+ if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
+ return false;
+
+ // Check other optional immediate operands for equality.
+ unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
+ AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
+ AMDGPU::OpName::da, AMDGPU::OpName::r128};
+
+ for (auto op : OperandsToMatch) {
+ int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
+ if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx)
+ return false;
+ if (Idx != -1 &&
+ CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm())
+ return false;
+ }
+
+ // Check DMask for overlaps.
+ unsigned MaxMask = std::max(CI.DMask0, CI.DMask1);
+ unsigned MinMask = std::min(CI.DMask0, CI.DMask1);
+
+ unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
+ if ((1u << AllowedBitsForMin) <= MinMask)
+ return false;
+
+ return true;
+}
+
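A worked example of the dmask overlap test above (a standalone sketch; __builtin_ctz stands in for llvm::countTrailingZeros): two dmasks are combinable only if the smaller mask lies entirely below the lowest set bit of the larger one, so the enabled channels neither overlap nor interleave.

#include <algorithm>
#include <cassert>

static bool dmasksMergeable(unsigned DMask0, unsigned DMask1) {
  unsigned MaxMask = std::max(DMask0, DMask1);
  unsigned MinMask = std::min(DMask0, DMask1);
  // All bits of the smaller mask must sit strictly below the lowest set bit
  // of the larger mask (MaxMask must be nonzero for ctz to be defined).
  unsigned AllowedBitsForMin = __builtin_ctz(MaxMask);
  return (1u << AllowedBitsForMin) > MinMask;
}

int main() {
  assert(dmasksMergeable(0x3, 0xc));  // 0b0011 sits below bit 2 of 0b1100: mergeable
  assert(!dmasksMergeable(0x6, 0xc)); // 0b0110 shares bit 2 with 0b1100: rejected
  return 0;
}
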
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
+ assert(CI.InstClass != MIMG);
+
// XXX - Would the same offset be OK? Is there any reason this would happen or
// be useful?
if (CI.Offset0 == CI.Offset1)
@@ -384,164 +749,24 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
}
}
-unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
- const unsigned Opc = MI.getOpcode();
-
- if (TII->isMUBUF(MI)) {
- return AMDGPU::getMUBUFDwords(Opc);
- }
-
- switch (Opc) {
- default:
- return 0;
- case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
- return 1;
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
- return 2;
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
- return 4;
- }
-}
-
-InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
- if (TII->isMUBUF(Opc)) {
- const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
-
- // If we couldn't identify the opcode, bail out.
- if (baseOpcode == -1) {
- return UNKNOWN;
- }
-
- switch (baseOpcode) {
- default:
- return UNKNOWN;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
- return BUFFER_LOAD_OFFEN;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
- return BUFFER_LOAD_OFFSET;
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
- return BUFFER_STORE_OFFEN;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
- return BUFFER_STORE_OFFSET;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
- return BUFFER_LOAD_OFFEN_exact;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
- return BUFFER_LOAD_OFFSET_exact;
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
- return BUFFER_STORE_OFFEN_exact;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
- return BUFFER_STORE_OFFSET_exact;
- }
- }
-
- switch (Opc) {
- default:
- return UNKNOWN;
- case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
- return S_BUFFER_LOAD_IMM;
- case AMDGPU::DS_READ_B32:
- case AMDGPU::DS_READ_B64:
- case AMDGPU::DS_READ_B32_gfx9:
- case AMDGPU::DS_READ_B64_gfx9:
- return DS_READ;
- case AMDGPU::DS_WRITE_B32:
- case AMDGPU::DS_WRITE_B64:
- case AMDGPU::DS_WRITE_B32_gfx9:
- case AMDGPU::DS_WRITE_B64_gfx9:
- return DS_WRITE;
- }
-}
-
-unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
- if (TII->isMUBUF(Opc)) {
- unsigned result = 0;
-
- if (AMDGPU::getMUBUFHasVAddr(Opc)) {
- result |= VADDR;
- }
-
- if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
- result |= SRSRC;
- }
-
- if (AMDGPU::getMUBUFHasSoffset(Opc)) {
- result |= SOFFSET;
- }
-
- return result;
- }
-
- switch (Opc) {
- default:
- return 0;
- case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
- return SBASE;
- case AMDGPU::DS_READ_B32:
- case AMDGPU::DS_READ_B64:
- case AMDGPU::DS_READ_B32_gfx9:
- case AMDGPU::DS_READ_B64_gfx9:
- case AMDGPU::DS_WRITE_B32:
- case AMDGPU::DS_WRITE_B64:
- case AMDGPU::DS_WRITE_B32_gfx9:
- case AMDGPU::DS_WRITE_B64_gfx9:
- return ADDR;
- }
-}
-
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
const unsigned Opc = CI.I->getOpcode();
- const InstClassEnum InstClass = getInstClass(Opc);
+ const InstClassEnum InstClass = getInstClass(Opc, *TII);
if (InstClass == UNKNOWN) {
return false;
}
+ const unsigned InstSubclass = getInstSubclass(Opc, *TII);
- const unsigned Regs = getRegs(Opc);
-
- unsigned AddrOpName[5] = {0};
- int AddrIdx[5];
- const MachineOperand *AddrReg[5];
- unsigned NumAddresses = 0;
-
- if (Regs & ADDR) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
- }
-
- if (Regs & SBASE) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
- }
-
- if (Regs & SRSRC) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
- }
-
- if (Regs & SOFFSET) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
- }
-
- if (Regs & VADDR) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
- }
-
- for (unsigned i = 0; i < NumAddresses; i++) {
- AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
- AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
-
- // We only ever merge operations with the same base address register, so
- // don't bother scanning forward if there are no other uses.
- if (AddrReg[i]->isReg() &&
- (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
- MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
- return false;
- }
+ // Do not merge VMEM buffer instructions with "swizzled" bit set.
+ int Swizzled =
+ AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
+ if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
+ return false;
++MBBI;
@@ -550,11 +775,10 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
for (; MBBI != E; ++MBBI) {
- const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
- if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
- (IsDS && (MBBI->getOpcode() != Opc))) {
- // This is not a matching DS instruction, but we can keep looking as
+ if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
+ (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
+ // This is not a matching instruction, but we can keep looking as
// long as one of these conditions are met:
// 1. It is safe to move I down past MBBI.
// 2. It is safe to move MBBI down past the instruction that I will
@@ -599,58 +823,23 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
CI.InstsToMove))
continue;
- bool Match = true;
- for (unsigned i = 0; i < NumAddresses; i++) {
- const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
-
- if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
- if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
- AddrReg[i]->getImm() != AddrRegNext.getImm()) {
- Match = false;
- break;
- }
- continue;
- }
-
- // Check same base pointer. Be careful of subregisters, which can occur
- // with vectors of pointers.
- if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
- AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
- Match = false;
- break;
- }
- }
+ bool Match = CI.hasSameBaseAddress(*MBBI);
if (Match) {
- int OffsetIdx =
- AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
- CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
- CI.Width0 = getOpcodeWidth(*CI.I);
- CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
- CI.Width1 = getOpcodeWidth(*MBBI);
- CI.Paired = MBBI;
-
- if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
- CI.Offset0 &= 0xffff;
- CI.Offset1 &= 0xffff;
- } else {
- CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
- CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
- if (CI.InstClass != S_BUFFER_LOAD_IMM) {
- CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
- CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
- }
- CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
- CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
- }
+ CI.setPaired(MBBI, *TII);
+
+ // Check both offsets (or masks for MIMG) can be combined and fit in the
+ // reduced range.
+ bool canBeCombined =
+ CI.InstClass == MIMG
+ ? dmasksCanBeCombined(CI, *TII)
+ : widthsFit(*STM, CI) && offsetsCanBeCombined(CI);
- // Check both offsets fit in the reduced range.
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
- if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
- if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
- return true;
+ if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+ return true;
}
// We've found a load/store that we couldn't merge for some reason.
@@ -711,15 +900,15 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
const TargetRegisterClass *SuperRC =
(CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
- unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
DebugLoc DL = CI.I->getDebugLoc();
- unsigned BaseReg = AddrReg->getReg();
+ Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
- unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
@@ -755,12 +944,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
moveInstsAfter(Copy1, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
- return Next;
+ return Read2;
}
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
@@ -809,11 +997,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = CI.I->getDebugLoc();
- unsigned BaseReg = AddrReg->getReg();
+ Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
- unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
@@ -839,12 +1027,65 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
moveInstsAfter(Write2, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
- return Next;
+ return Write2;
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ const unsigned Opcode = getNewOpcode(CI);
+
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
+ unsigned MergedDMask = CI.DMask0 | CI.DMask1;
+ unsigned DMaskIdx =
+ AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
+
+ auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
+ for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
+ if (I == DMaskIdx)
+ MIB.addImm(MergedDMask);
+ else
+ MIB.add((*CI.I).getOperand(I));
+ }
+
+ // It shouldn't be possible to get this far if the two instructions
+ // don't have a single memoperand, because MachineInstr::mayAlias()
+ // will return true if this is the case.
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+ MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, CI.InstsToMove);
+
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return New;
}
MachineBasicBlock::iterator
@@ -855,15 +1096,24 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
- unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
- BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
- .addImm(MergedOffset) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.DLC0) // dlc
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+ // It shouldn't be possible to get this far if the two instructions
+ // don't have a single memoperand, because MachineInstr::mayAlias()
+ // will return true if this is the case.
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+ MachineInstr *New =
+ BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.DLC0) // dlc
+ .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -883,10 +1133,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
moveInstsAfter(Copy1, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- return Next;
+ return New;
}
MachineBasicBlock::iterator
@@ -899,24 +1148,34 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
// Copy to the new source register.
- unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
- const unsigned Regs = getRegs(Opcode);
+ const unsigned Regs = getRegs(Opcode, *TII);
if (Regs & VADDR)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
- .addImm(MergedOffset) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
- .addImm(0) // tfe
- .addImm(CI.DLC0) // dlc
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+ // It shouldn't be possible to get this far if the two instructions
+ // don't have a single memoperand, because MachineInstr::mayAlias()
+ // will return true if this is the case.
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+ MachineInstr *New =
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .addImm(CI.DLC0) // dlc
+ .addImm(0) // swz
+ .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -936,10 +1195,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
moveInstsAfter(Copy1, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- return Next;
+ return New;
}
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
@@ -947,7 +1205,10 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
switch (CI.InstClass) {
default:
- return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
+ assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
+ // FIXME: Handle d16 correctly
+ return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
+ Width);
case UNKNOWN:
llvm_unreachable("Unknown instruction class");
case S_BUFFER_LOAD_IMM:
@@ -959,76 +1220,47 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
}
+ case MIMG:
+ assert((countPopulation(CI.DMask0 | CI.DMask1) == Width) && "No overlaps");
+ return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
}
}
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
- if (CI.Offset0 > CI.Offset1) {
- switch (CI.Width0) {
- default:
- return std::make_pair(0, 0);
- case 1:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
- case 2:
- return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
- case 3:
- return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
- }
- case 2:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
- case 2:
- return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
- }
- case 3:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
- }
- }
+
+ if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
+ return std::make_pair(0, 0);
+
+ bool ReverseOrder;
+ if (CI.InstClass == MIMG) {
+ assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) &&
+ "No overlaps");
+ ReverseOrder = CI.DMask0 > CI.DMask1;
+ } else
+ ReverseOrder = CI.Offset0 > CI.Offset1;
+
+ static const unsigned Idxs[4][4] = {
+ {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+ {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+ {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+ {AMDGPU::sub3, 0, 0, 0},
+ };
+ unsigned Idx0;
+ unsigned Idx1;
+
+ assert(CI.Width0 >= 1 && CI.Width0 <= 3);
+ assert(CI.Width1 >= 1 && CI.Width1 <= 3);
+
+ if (ReverseOrder) {
+ Idx1 = Idxs[0][CI.Width1 - 1];
+ Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
} else {
- switch (CI.Width0) {
- default:
- return std::make_pair(0, 0);
- case 1:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
- case 2:
- return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
- case 3:
- return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
- }
- case 2:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
- case 2:
- return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
- }
- case 3:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
- }
- }
+ Idx0 = Idxs[0][CI.Width0 - 1];
+ Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
}
+
+ return std::make_pair(Idx0, Idx1);
}
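
The Idxs table above is indexed by (starting dword, width - 1); whichever instruction owns the lower offset (or lower dmask for MIMG) takes the dwords starting at 0, and the other one starts right after it. A standalone sketch of the same lookup, with strings standing in for the AMDGPU::sub* enumerators:

#include <cstdio>
#include <utility>

// Entries that would run past dword 3 are left null.
static const char *Idxs[4][4] = {
    {"sub0", "sub0_sub1", "sub0_sub1_sub2", "sub0_sub1_sub2_sub3"},
    {"sub1", "sub1_sub2", "sub1_sub2_sub3", nullptr},
    {"sub2", "sub2_sub3", nullptr, nullptr},
    {"sub3", nullptr, nullptr, nullptr},
};

// Returns (first instruction's subreg, second instruction's subreg).
static std::pair<const char *, const char *>
subRegIdxs(unsigned Width0, unsigned Width1, bool ReverseOrder) {
  if (ReverseOrder)
    return {Idxs[Width1][Width0 - 1], Idxs[0][Width1 - 1]};
  return {Idxs[0][Width0 - 1], Idxs[Width0][Width1 - 1]};
}

int main() {
  auto P = subRegIdxs(2, 1, /*ReverseOrder=*/false);
  std::printf("%s %s\n", P.first, P.second); // prints: sub0_sub1 sub2
  return 0;
}
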
const TargetRegisterClass *
@@ -1040,7 +1272,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
case 2:
return &AMDGPU::SReg_64_XEXECRegClass;
case 4:
- return &AMDGPU::SReg_128RegClass;
+ return &AMDGPU::SGPR_128RegClass;
case 8:
return &AMDGPU::SReg_256RegClass;
case 16:
@@ -1073,7 +1305,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
// Copy to the new source register.
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
- unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
+ Register SrcReg = MRI->createVirtualRegister(SuperRC);
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
@@ -1087,35 +1319,45 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
- const unsigned Regs = getRegs(Opcode);
+ const unsigned Regs = getRegs(Opcode, *TII);
if (Regs & VADDR)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
- .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
- .addImm(0) // tfe
- .addImm(CI.DLC0) // dlc
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+
+ // It shouldn't be possible to get this far if the two instructions
+ // don't have a single memoperand, because MachineInstr::mayAlias()
+ // will return true if this is the case.
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+ MachineInstr *New =
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .addImm(CI.DLC0) // dlc
+ .addImm(0) // swz
+ .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
moveInstsAfter(MIB, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- return Next;
+ return New;
}
MachineOperand
-SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
+SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
APInt V(32, Val, true);
if (TII->isInlineConstant(V))
return MachineOperand::CreateImm(Val);
- unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineInstr *Mov =
BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), Reg)
@@ -1127,7 +1369,7 @@ SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
// Compute base address using Addr and return the final register.
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
- const MemAddress &Addr) {
+ const MemAddress &Addr) const {
MachineBasicBlock *MBB = MI.getParent();
MachineBasicBlock::iterator MBBI = MI.getIterator();
DebugLoc DL = MI.getDebugLoc();
@@ -1146,11 +1388,11 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned CarryReg = MRI->createVirtualRegister(CarryRC);
- unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC);
+ Register CarryReg = MRI->createVirtualRegister(CarryRC);
+ Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
- unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *LoHalf =
BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
.addReg(CarryReg, RegState::Define)
@@ -1170,7 +1412,7 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
- unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
MachineInstr *FullBase =
BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
@@ -1186,13 +1428,13 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
unsigned NewBase,
- int32_t NewOffset) {
+ int32_t NewOffset) const {
TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}
Optional<int32_t>
-SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
+SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
if (Op.isImm())
return Op.getImm();
@@ -1218,7 +1460,7 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
// %Base:vreg_64 =
// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
- MemAddress &Addr) {
+ MemAddress &Addr) const {
if (!Base.isReg())
return;
@@ -1273,15 +1515,16 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
MachineInstr &MI,
MemInfoMap &Visited,
- SmallPtrSet<MachineInstr *, 4> &AnchorList) {
+ SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
+
+ if (!(MI.mayLoad() ^ MI.mayStore()))
+ return false;
// TODO: Support flat and scratch.
- if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
- TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
+ if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
return false;
- // TODO: Support Store.
- if (!MI.mayLoad())
+ if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
return false;
if (AnchorList.count(&MI))
@@ -1418,100 +1661,166 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
return false;
}
-// Scan through looking for adjacent LDS operations with constant offsets from
-// the same base register. We rely on the scheduler to do the hard work of
-// clustering nearby loads, and assume these are all adjacent.
-bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
- bool Modified = false;
+void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
+ std::list<std::list<CombineInfo> > &MergeableInsts) const {
+ for (std::list<CombineInfo> &AddrList : MergeableInsts) {
+ if (AddrList.front().hasSameBaseAddress(*CI.I) &&
+ AddrList.front().InstClass == CI.InstClass) {
+ AddrList.emplace_back(CI);
+ return;
+ }
+ }
+
+ // Base address not found, so add a new list.
+ MergeableInsts.emplace_back(1, CI);
+}
+bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
+ std::list<std::list<CombineInfo> > &MergeableInsts) const {
+ bool Modified = false;
  // Caches the base address and constant offset computed for each visited
  // instruction.
MemInfoMap Visited;
// Contains the list of instructions for which constant offsets are being
// promoted to the IMM.
SmallPtrSet<MachineInstr *, 4> AnchorList;
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
- MachineInstr &MI = *I;
-
+ // Sort potentially mergeable instructions into lists, one list per base address.
+ for (MachineInstr &MI : MBB.instrs()) {
+ // We run this before checking if an address is mergeable, because it can produce
+ // better code even if the instructions aren't mergeable.
if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
Modified = true;
+ const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
+ if (InstClass == UNKNOWN)
+ continue;
+
// Don't combine if volatile.
- if (MI.hasOrderedMemoryRef()) {
- ++I;
+ if (MI.hasOrderedMemoryRef())
+ continue;
+
+ CombineInfo CI;
+ CI.setMI(MI, *TII, *STM);
+
+ if (!CI.hasMergeableAddress(*MRI))
+ continue;
+
+ addInstToMergeableList(CI, MergeableInsts);
+ }
+ return Modified;
+}
+
+// Scan through looking for adjacent LDS operations with constant offsets from
+// the same base register. We rely on the scheduler to do the hard work of
+// clustering nearby loads, and assume these are all adjacent.
+bool SILoadStoreOptimizer::optimizeBlock(
+ std::list<std::list<CombineInfo> > &MergeableInsts) {
+ bool Modified = false;
+
+ for (std::list<CombineInfo> &MergeList : MergeableInsts) {
+ if (MergeList.size() < 2)
+ continue;
+
+ bool OptimizeListAgain = false;
+ if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
+ // We weren't able to make any changes, so clear the list so we don't
+ // process the same instructions the next time we try to optimize this
+ // block.
+ MergeList.clear();
continue;
}
- const unsigned Opc = MI.getOpcode();
+ // We made changes, but also determined that there were no more optimization
+ // opportunities, so we don't need to reprocess the list.
+ if (!OptimizeListAgain)
+ MergeList.clear();
- CombineInfo CI;
- CI.I = I;
- CI.InstClass = getInstClass(Opc);
+ OptimizeAgain |= OptimizeListAgain;
+ Modified = true;
+ }
+ return Modified;
+}
+
+void
+SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
+ const MachineInstr &MI) {
+
+ for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
+ if (&*CI->I == &MI) {
+ MergeList.erase(CI);
+ return;
+ }
+ }
+}
+
+bool
+SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
+ std::list<CombineInfo> &MergeList,
+ bool &OptimizeListAgain) {
+ bool Modified = false;
+ for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
+ CombineInfo &CI = *I;
switch (CI.InstClass) {
default:
break;
case DS_READ:
- CI.EltSize =
- (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
- : 4;
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeRead2Pair(CI);
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
+ CI.setMI(NewMI, *TII, *STM);
}
- continue;
+ break;
case DS_WRITE:
- CI.EltSize =
- (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
- : 4;
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeWrite2Pair(CI);
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
+ CI.setMI(NewMI, *TII, *STM);
}
- continue;
+ break;
case S_BUFFER_LOAD_IMM:
- CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeSBufferLoadImmPair(CI);
- OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
+ CI.setMI(NewMI, *TII, *STM);
+ OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
}
- continue;
- case BUFFER_LOAD_OFFEN:
- case BUFFER_LOAD_OFFSET:
- case BUFFER_LOAD_OFFEN_exact:
- case BUFFER_LOAD_OFFSET_exact:
- CI.EltSize = 4;
+ break;
+ case BUFFER_LOAD:
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeBufferLoadPair(CI);
- OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
+ CI.setMI(NewMI, *TII, *STM);
+ OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
}
- continue;
- case BUFFER_STORE_OFFEN:
- case BUFFER_STORE_OFFSET:
- case BUFFER_STORE_OFFEN_exact:
- case BUFFER_STORE_OFFSET_exact:
- CI.EltSize = 4;
+ break;
+ case BUFFER_STORE:
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeBufferStorePair(CI);
- OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
+ CI.setMI(NewMI, *TII, *STM);
+ OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
}
- continue;
+ break;
+ case MIMG:
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeImagePair(CI);
+ CI.setMI(NewMI, *TII, *STM);
+ OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+ }
+ break;
}
-
- ++I;
+ // Clear the InstsToMove after we have finished searching so we don't have
+ // stale values left over if we search for this CI again in another pass
+ // over the block.
+ CI.InstsToMove.clear();
}
return Modified;
@@ -1537,10 +1846,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
bool Modified = false;
+
for (MachineBasicBlock &MBB : MF) {
+ std::list<std::list<CombineInfo> > MergeableInsts;
+ // First pass: Collect list of all instructions we know how to merge.
+ Modified |= collectMergeableInsts(MBB, MergeableInsts);
do {
OptimizeAgain = false;
- Modified |= optimizeBlock(MBB);
+ Modified |= optimizeBlock(MergeableInsts);
} while (OptimizeAgain);
}
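
The rewrite above splits the pass into two phases: collectMergeableInsts buckets candidate memory instructions into one list per base address, and optimizeBlock then walks each list, pairing and merging entries until a pass over the list makes no further progress (OptimizeAgain/OptimizeListAgain). Below is a rough, self-contained sketch of just that driver shape; the Inst struct, collect/optimize helpers, and the printf "merge" are toy stand-ins for CombineInfo and the real merge*Pair routines, not the pass itself.

#include <algorithm>
#include <cstdio>
#include <iterator>
#include <list>
#include <vector>

// Toy stand-in for MachineInstr/CombineInfo: a base-address id plus an offset.
struct Inst { int Base; int Offset; };

// Group candidates into one list per base address (collectMergeableInsts).
static std::list<std::list<Inst>> collect(const std::vector<Inst> &Block) {
  std::list<std::list<Inst>> Mergeable;
  for (const Inst &I : Block) {
    bool Placed = false;
    for (std::list<Inst> &L : Mergeable)
      if (L.front().Base == I.Base) { L.push_back(I); Placed = true; break; }
    if (!Placed)
      Mergeable.emplace_back(1, I); // new base address, new list
  }
  return Mergeable;
}

// Merge one adjacent pair per list per pass, looping while progress is made
// (optimizeBlock / optimizeInstsWithSameBaseAddr).
static void optimize(std::list<std::list<Inst>> &Mergeable) {
  bool Again;
  do {
    Again = false;
    for (std::list<Inst> &L : Mergeable) {
      if (L.size() < 2) continue;
      auto A = L.begin(), B = std::next(A);
      std::printf("merge base %d offsets %d+%d\n", A->Base, A->Offset, B->Offset);
      A->Offset = std::min(A->Offset, B->Offset); // merged "wider" access
      L.erase(B);                                 // drop the paired inst
      Again |= L.size() >= 2;                     // more pairs may remain
    }
  } while (Again);
}

int main() {
  std::vector<Inst> Block = {{0, 0}, {0, 4}, {1, 8}, {0, 8}, {1, 12}};
  auto Lists = collect(Block);
  optimize(Lists);
}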
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 78f409cd9555..6f9abd3a8d9b 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -98,6 +98,8 @@ private:
void emitLoop(MachineInstr &MI);
void emitEndCf(MachineInstr &MI);
+ Register getSaveExec(MachineInstr* MI);
+
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
@@ -144,7 +146,7 @@ char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
const SIInstrInfo *TII) {
- unsigned SaveExecReg = MI.getOperand(0).getReg();
+ Register SaveExecReg = MI.getOperand(0).getReg();
auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
if (U == MRI->use_instr_nodbg_end() ||
@@ -175,17 +177,31 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
return true;
}
+Register SILowerControlFlow::getSaveExec(MachineInstr *MI) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineOperand &SaveExec = MI->getOperand(0);
+ assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister);
+
+ Register SaveExecReg = SaveExec.getReg();
+ unsigned FalseTermOpc =
+ TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
+ MachineBasicBlock::iterator I = (MI);
+ MachineBasicBlock::iterator J = std::next(I);
+ if (J != MBB->end() && J->getOpcode() == FalseTermOpc &&
+ J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) {
+ SaveExecReg = J->getOperand(0).getReg();
+ J->eraseFromParent();
+ }
+ return SaveExecReg;
+}
+
void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
-
- MachineOperand &SaveExec = MI.getOperand(0);
- MachineOperand &Cond = MI.getOperand(1);
- assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
- Cond.getSubReg() == AMDGPU::NoSubRegister);
-
- Register SaveExecReg = SaveExec.getReg();
+ Register SaveExecReg = getSaveExec(&MI);
+ MachineOperand& Cond = MI.getOperand(1);
+ assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
MachineOperand &ImpDefSCC = MI.getOperand(4);
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
@@ -204,7 +220,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
.addReg(Exec)
.addReg(Exec, RegState::ImplicitDefine);
- unsigned Tmp = MRI->createVirtualRegister(BoolRC);
+ Register Tmp = MRI->createVirtualRegister(BoolRC);
MachineInstr *And =
BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
@@ -266,8 +282,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- Register DstReg = MI.getOperand(0).getReg();
- assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
+ Register DstReg = getSaveExec(&MI);
bool ExecModified = MI.getOperand(3).getImm() != 0;
MachineBasicBlock::iterator Start = MBB.begin();
@@ -339,7 +354,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- auto Dst = MI.getOperand(0).getReg();
+ auto Dst = getSaveExec(&MI);
// Skip ANDing with exec if the break condition is already masked by exec
// because it is a V_CMP in the same basic block. (We know the break
@@ -400,13 +415,17 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ unsigned CFMask = MI.getOperand(0).getReg();
+ MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
const DebugLoc &DL = MI.getDebugLoc();
- MachineBasicBlock::iterator InsPt = MBB.begin();
- MachineInstr *NewMI =
- BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
- .addReg(Exec)
- .add(MI.getOperand(0));
+ MachineBasicBlock::iterator InsPt =
+ Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def))
+ : MBB.begin();
+ MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
+ .addReg(Exec)
+ .add(MI.getOperand(0));
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
@@ -422,7 +441,7 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const {
MachineOperand &Op = MI.getOperand(OpNo);
- if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+ if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) {
Src.push_back(Op);
return;
}
@@ -442,8 +461,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
for (const auto &SrcOp : Def->explicit_operands())
if (SrcOp.isReg() && SrcOp.isUse() &&
- (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
- SrcOp.getReg() == Exec))
+ (Register::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == Exec))
Src.push_back(SrcOp);
}
@@ -466,7 +484,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
else return;
- unsigned Reg = MI.getOperand(OpToReplace).getReg();
+ Register Reg = MI.getOperand(OpToReplace).getReg();
MI.RemoveOperand(OpToReplace);
MI.addOperand(Ops[UniqueOpndIdx]);
if (MRI->use_empty(Reg))
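
Two of the SILowerControlFlow changes are behavioural rather than type churn: getSaveExec peeks at the next instruction and, if it is an S_MOV_B32_term/S_MOV_B64_term copying the just-defined save-exec register, folds the copy by returning its destination and erasing it; and emitEndCf now inserts the exec-restoring S_OR right after the in-block def of the control-flow mask instead of always at the top of the block. A minimal sketch of the copy-folding idea on a toy instruction list (the Inst struct and opcode strings are placeholders, not the real MachineInstr representation):

#include <cstdio>
#include <iterator>
#include <list>
#include <string>

// Toy IR: opcode plus destination/source register numbers.
struct Inst { std::string Opc; int Dst; int Src; };

// Mirror of the getSaveExec idea: if the very next instruction is a
// terminator move that copies the save-exec register, use its destination
// and erase the now-redundant copy.
static int getSaveExec(std::list<Inst> &Block, std::list<Inst>::iterator I) {
  int SaveExecReg = I->Dst;
  auto J = std::next(I);
  if (J != Block.end() && J->Opc == "mov_term" && J->Src == SaveExecReg) {
    SaveExecReg = J->Dst;
    Block.erase(J);
  }
  return SaveExecReg;
}

int main() {
  std::list<Inst> Block = {{"saveexec", 1, -1}, {"mov_term", 2, 1}, {"use", -1, 2}};
  int Reg = getSaveExec(Block, Block.begin());
  std::printf("save-exec lowered into %%%d, %zu insts remain\n", Reg, Block.size());
}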
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 1c0f836f07e6..b45412536356 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -96,7 +96,7 @@ private:
getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
bool isVreg1(unsigned Reg) const {
- return TargetRegisterInfo::isVirtualRegister(Reg) &&
+ return Register::isVirtualRegister(Reg) &&
MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
}
@@ -489,6 +489,15 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
return true;
}
+#ifndef NDEBUG
+static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI,
+ Register Reg) {
+ unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
+ return Size == 1 || Size == 32;
+}
+#endif
+
void SILowerI1Copies::lowerCopiesFromI1() {
SmallVector<MachineInstr *, 4> DeadCopies;
@@ -497,8 +506,8 @@ void SILowerI1Copies::lowerCopiesFromI1() {
if (MI.getOpcode() != AMDGPU::COPY)
continue;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
if (!isVreg1(SrcReg))
continue;
@@ -509,7 +518,7 @@ void SILowerI1Copies::lowerCopiesFromI1() {
LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
DebugLoc DL = MI.getDebugLoc();
- assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
+ assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
assert(!MI.getOperand(0).getSubReg());
ConstrainRegs.insert(SrcReg);
@@ -544,7 +553,7 @@ void SILowerI1Copies::lowerPhis() {
LF.initialize(MBB);
for (MachineInstr &MI : MBB.phis()) {
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if (!isVreg1(DstReg))
continue;
@@ -556,7 +565,7 @@ void SILowerI1Copies::lowerPhis() {
// Collect incoming values.
for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
assert(i + 1 < MI.getNumOperands());
- unsigned IncomingReg = MI.getOperand(i).getReg();
+ Register IncomingReg = MI.getOperand(i).getReg();
MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
@@ -580,12 +589,12 @@ void SILowerI1Copies::lowerPhis() {
// Phis in a loop that are observed outside the loop receive a simple but
// conservatively correct treatment.
- MachineBasicBlock *PostDomBound = &MBB;
- for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
- PostDomBound =
- PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
- }
+ std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
+ for (MachineInstr &Use : MRI->use_instructions(DstReg))
+ DomBlocks.push_back(Use.getParent());
+ MachineBasicBlock *PostDomBound =
+ PDT->findNearestCommonDominator(DomBlocks);
unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
SSAUpdater.Initialize(DstReg);
@@ -669,7 +678,7 @@ void SILowerI1Copies::lowerCopiesToI1() {
MI.getOpcode() != AMDGPU::COPY)
continue;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if (!isVreg1(DstReg))
continue;
@@ -686,10 +695,10 @@ void SILowerI1Copies::lowerCopiesToI1() {
continue;
DebugLoc DL = MI.getDebugLoc();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
assert(!MI.getOperand(1).getSubReg());
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ if (!Register::isVirtualRegister(SrcReg) ||
(!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
unsigned TmpReg = createLaneMaskReg(*MF);
@@ -702,12 +711,12 @@ void SILowerI1Copies::lowerCopiesToI1() {
// Defs in a loop that are observed outside the loop must be transformed
// into appropriate bit manipulation.
- MachineBasicBlock *PostDomBound = &MBB;
- for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
- PostDomBound =
- PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
- }
+ std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
+ for (MachineInstr &Use : MRI->use_instructions(DstReg))
+ DomBlocks.push_back(Use.getParent());
+ MachineBasicBlock *PostDomBound =
+ PDT->findNearestCommonDominator(DomBlocks);
unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
if (FoundLoopLevel) {
SSAUpdater.Initialize(DstReg);
@@ -734,7 +743,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
break;
Reg = MI->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return false;
if (!isLaneMaskReg(Reg))
return false;
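
The phi/copy lowering now collects all use blocks into a vector and asks the post-dominator tree for their nearest common dominator in one call, instead of folding the binary query in a loop; the result is the same. A toy illustration of why the two forms are equivalent, using an explicit parent/depth tree as a stand-in for the post-dominator tree:

#include <cstdio>
#include <vector>

// Toy tree node: parent link plus depth, which is all a
// nearest-common-ancestor query needs.
struct Node { int Parent; int Depth; };

static int commonAncestor(const std::vector<Node> &T, int A, int B) {
  while (T[A].Depth > T[B].Depth) A = T[A].Parent;
  while (T[B].Depth > T[A].Depth) B = T[B].Parent;
  while (A != B) { A = T[A].Parent; B = T[B].Parent; }
  return A;
}

// Batch form over a whole set of nodes, i.e. what the single
// findNearestCommonDominator(DomBlocks) call does: fold the binary query
// across the vector.
static int commonAncestor(const std::vector<Node> &T, const std::vector<int> &Nodes) {
  int R = Nodes.front();
  for (int N : Nodes) R = commonAncestor(T, R, N);
  return R;
}

int main() {
  // Tree: 0 is the root; 1 and 2 are children of 0; 3 and 4 under 1; 5 under 2.
  std::vector<Node> T = {{0,0},{0,1},{0,1},{1,2},{1,2},{2,2}};
  std::printf("NCA(3,4,5) = %d\n", commonAncestor(T, {3, 4, 5}));
}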
diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index a82047473370..714d403a3e8f 100644
--- a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -278,8 +278,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vaddr);
int FI = MI.getOperand(FIOp).getIndex();
- unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)
- ->getReg();
+ Register VReg =
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr);
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 46da974a2f45..7dd0f11c95de 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -53,8 +53,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
- Occupancy = getMaxWavesPerEU();
- limitOccupancy(MF);
+ Occupancy = ST.computeOccupancy(MF, getLDSSize());
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
@@ -190,7 +189,7 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
const SIRegisterInfo &TRI) {
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
NumUserSGPRs += 4;
return ArgInfo.PrivateSegmentBuffer.getRegister();
}
@@ -487,6 +486,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
MemoryBound(MFI.isMemoryBound()),
WaveLimiter(MFI.needsWaveLimiter()),
+ HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
@@ -501,8 +501,9 @@ void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
bool SIMachineFunctionInfo::initializeBaseYamlFields(
const yaml::SIMachineFunctionInfo &YamlMFI) {
ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
- MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
+ MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign);
LDSSize = YamlMFI.LDSSize;
+ HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
IsEntryFunction = YamlMFI.IsEntryFunction;
NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
MemoryBound = YamlMFI.MemoryBound;
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index f19b20ceb5da..7d70c786b594 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -265,6 +265,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool NoSignedZerosFPMath = false;
bool MemoryBound = false;
bool WaveLimiter = false;
+ uint32_t HighBitsOf32BitAddress = 0;
StringValue ScratchRSrcReg = "$private_rsrc_reg";
StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg";
@@ -302,6 +303,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue("$sp_reg"));
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
YamlIO.mapOptional("mode", MFI.Mode, SIMode());
+ YamlIO.mapOptional("highBitsOf32BitAddress",
+ MFI.HighBitsOf32BitAddress, 0u);
}
};
@@ -670,7 +673,7 @@ public:
return GITPtrHigh;
}
- unsigned get32BitAddressHighBits() const {
+ uint32_t get32BitAddressHighBits() const {
return HighBitsOf32BitAddress;
}
@@ -873,7 +876,7 @@ public:
assert(BufferRsrc);
auto PSV = BufferPSVs.try_emplace(
BufferRsrc,
- llvm::make_unique<AMDGPUBufferPseudoSourceValue>(TII));
+ std::make_unique<AMDGPUBufferPseudoSourceValue>(TII));
return PSV.first->second.get();
}
@@ -882,14 +885,14 @@ public:
assert(ImgRsrc);
auto PSV = ImagePSVs.try_emplace(
ImgRsrc,
- llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII));
+ std::make_unique<AMDGPUImagePseudoSourceValue>(TII));
return PSV.first->second.get();
}
const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
if (!GWSResourcePSV) {
GWSResourcePSV =
- llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
+ std::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
}
return GWSResourcePSV.get();
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index ebbdf80f9567..c072ba6b2d1c 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -348,7 +348,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// Do not Track Physical Registers, because it messes up.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
- if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
+ if (Register::isVirtualRegister(RegMaskPair.RegUnit))
LiveInRegs.insert(RegMaskPair.RegUnit);
}
LiveOutRegs.clear();
@@ -376,7 +376,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// The use of findDefBetween removes the case 4.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
unsigned Reg = RegMaskPair.RegUnit;
- if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ if (Register::isVirtualRegister(Reg) &&
isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
LIS)) {
@@ -1228,7 +1228,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
unsigned Color = CurrentColoring[SU->NodeNum];
if (RealID.find(Color) == RealID.end()) {
int ID = CurrentBlocks.size();
- BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
+ BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
RealID[Color] = ID;
}
@@ -1690,7 +1690,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
for (unsigned Reg : Regs) {
// For now only track virtual registers.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
// If not already in the live set, then add it.
(void) LiveRegs.insert(Reg);
@@ -1750,7 +1750,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
for (unsigned Reg : InRegs) {
// For now only track virtual registers.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
if (LiveRegsConsumers[Reg] > 1)
continue;
@@ -1762,7 +1762,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
for (unsigned Reg : OutRegs) {
// For now only track virtual registers.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
for (; PSetI.isValid(); ++PSetI) {
@@ -1801,7 +1801,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
// SIScheduleDAGMI //
SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
- ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
+ ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
SITII = static_cast<const SIInstrInfo*>(TII);
SITRI = static_cast<const SIRegisterInfo*>(TRI);
@@ -1913,7 +1913,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
for (_Iterator RegI = First; RegI != End; ++RegI) {
unsigned Reg = *RegI;
// For now only track virtual registers
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
PSetIterator PSetI = MRI.getPressureSets(Reg);
for (; PSetI.isValid(); ++PSetI) {
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 4320e6c957a0..e914573306ae 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -656,10 +656,10 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) {
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return make_unique<SIGfx6CacheControl>(ST);
+ return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
- return make_unique<SIGfx7CacheControl>(ST);
- return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
+ return std::make_unique<SIGfx7CacheControl>(ST);
+ return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
}
bool SIGfx6CacheControl::enableLoadCacheBypass(
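
The make_unique churn throughout these files is mechanical: llvm::make_unique was the pre-C++14 polyfill, and with the move to C++14 the standard library version is used directly. A trivial usage sketch (CacheControl here is a placeholder type, not the SICacheControl hierarchy):

#include <memory>

struct CacheControl { explicit CacheControl(int Gen) : Gen(Gen) {} int Gen; };

int main() {
  // std::make_unique forwards its arguments to the constructor and returns a
  // std::unique_ptr<T>; semantically the same as the old llvm::make_unique.
  std::unique_ptr<CacheControl> CC = std::make_unique<CacheControl>(10);
  return CC->Gen == 10 ? 0 : 1;
}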
diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp
index a5edd7b3554a..52989a280e80 100644
--- a/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -226,7 +226,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
// - on exit we have set the Require, Change, and initial Exit modes.
void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
const SIInstrInfo *TII) {
- auto NewInfo = llvm::make_unique<BlockData>();
+ auto NewInfo = std::make_unique<BlockData>();
MachineInstr *InsertionPoint = nullptr;
// RequirePending is used to indicate whether we are collecting the initial
// requirements for the block, and need to defer the first InsertionPoint to
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 3227bff20513..cc9b46a75582 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -322,7 +322,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
+ Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
MachineInstr *SaveExecInst = nullptr;
SmallVector<MachineInstr *, 4> OtherUseInsts;
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 7e10316eab92..fdd30db6a7cb 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -211,7 +211,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
return AMDGPU::NoRegister;
MachineOperand *AndCC = &And->getOperand(1);
- unsigned CmpReg = AndCC->getReg();
+ Register CmpReg = AndCC->getReg();
unsigned CmpSubReg = AndCC->getSubReg();
if (CmpReg == ExecReg) {
AndCC = &And->getOperand(2);
@@ -234,7 +234,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
return AMDGPU::NoRegister;
- unsigned SelReg = Op1->getReg();
+ Register SelReg = Op1->getReg();
auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
return AMDGPU::NoRegister;
@@ -250,15 +250,16 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
Op1->getImm() != 0 || Op2->getImm() != 1)
return AMDGPU::NoRegister;
- LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t'
- << *Cmp << '\t' << *And);
+ LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
+ << *And);
- unsigned CCReg = CC->getReg();
+ Register CCReg = CC->getReg();
LIS->RemoveMachineInstrFromMaps(*And);
- MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
- TII->get(Andn2Opc), And->getOperand(0).getReg())
- .addReg(ExecReg)
- .addReg(CCReg, 0, CC->getSubReg());
+ MachineInstr *Andn2 =
+ BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
+ And->getOperand(0).getReg())
+ .addReg(ExecReg)
+ .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg());
And->eraseFromParent();
LIS->InsertMachineInstrInMaps(*Andn2);
@@ -266,20 +267,19 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
  // Try to remove the compare. The Cmp value should not be used in between the
  // cmp and s_and_b64 if it is VCC, or should simply be unused for any other
  // register.
- if ((TargetRegisterInfo::isVirtualRegister(CmpReg) &&
- MRI.use_nodbg_empty(CmpReg)) ||
+ if ((Register::isVirtualRegister(CmpReg) && MRI.use_nodbg_empty(CmpReg)) ||
(CmpReg == CondReg &&
std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
[&](const MachineInstr &MI) {
- return MI.readsRegister(CondReg, TRI); }))) {
+ return MI.readsRegister(CondReg, TRI);
+ }))) {
LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
LIS->RemoveMachineInstrFromMaps(*Cmp);
Cmp->eraseFromParent();
// Try to remove v_cndmask_b32.
- if (TargetRegisterInfo::isVirtualRegister(SelReg) &&
- MRI.use_nodbg_empty(SelReg)) {
+ if (Register::isVirtualRegister(SelReg) && MRI.use_nodbg_empty(SelReg)) {
LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
LIS->RemoveMachineInstrFromMaps(*Sel);
@@ -413,7 +413,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (!SaveExec || !SaveExec->isFullCopy())
continue;
- unsigned SavedExec = SaveExec->getOperand(0).getReg();
+ Register SavedExec = SaveExec->getOperand(0).getReg();
bool SafeToReplace = true;
for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
if (U.getParent() != SaveExec->getParent()) {
@@ -434,7 +434,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (Changed) {
for (auto Reg : RecalcRegs) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
LIS->removeInterval(Reg);
if (!MRI.reg_empty(Reg))
LIS->createAndComputeVirtRegInterval(Reg);
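
For context on optimizeVcndVcmpPair: the pattern being folded is v_cndmask_b32 sel, 0, 1, cond followed by v_cmp_ne(sel, 1) and an s_and with exec, which per lane is exec & ~cond, i.e. a single s_andn2; the diff additionally propagates the undef flag of the condition operand onto the new use. A per-lane model of the identity on plain 64-bit wave masks (ignoring subregisters and inactive lanes):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Each bit is one lane; 'cond' is the lane mask feeding V_CNDMASK_B32.
static uint64_t foldedSequence(uint64_t exec, uint64_t cond) {
  uint64_t sel = cond;  // v_cndmask_b32 sel, 0, 1, cond   (per lane)
  uint64_t cmp = ~sel;  // v_cmp_ne_u32  cmp, sel, 1       (per lane)
  return exec & cmp;    // s_and_b64     dst, exec, cmp
}

static uint64_t andn2(uint64_t exec, uint64_t cond) {
  return exec & ~cond;  // s_andn2_b64   dst, exec, cond
}

int main() {
  uint64_t Exec = 0xF0F0F0F0F0F0F0F0ull, Cond = 0x123456789ABCDEF0ull;
  assert(foldedSequence(Exec, Cond) == andn2(Exec, Cond));
  std::printf("fold holds: 0x%016llx\n", (unsigned long long)andn2(Exec, Cond));
}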
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 2d71abc0612a..9b3b2436475c 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -574,16 +574,16 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
+ if (Register::isPhysicalRegister(Src1->getReg()) ||
+ Register::isPhysicalRegister(Dst->getReg()))
break;
if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
Opcode == AMDGPU::V_LSHLREV_B32_e64) {
- return make_unique<SDWADstOperand>(
+ return std::make_unique<SDWADstOperand>(
Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
} else {
- return make_unique<SDWASrcOperand>(
+ return std::make_unique<SDWASrcOperand>(
Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
Opcode != AMDGPU::V_LSHRREV_B32_e64);
@@ -613,15 +613,15 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
+ if (Register::isPhysicalRegister(Src1->getReg()) ||
+ Register::isPhysicalRegister(Dst->getReg()))
break;
if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
Opcode == AMDGPU::V_LSHLREV_B16_e64) {
- return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+ return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
} else {
- return make_unique<SDWASrcOperand>(
+ return std::make_unique<SDWASrcOperand>(
Src1, Dst, BYTE_1, false, false,
Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
Opcode != AMDGPU::V_LSHRREV_B16_e64);
@@ -677,11 +677,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src0->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
+ if (Register::isPhysicalRegister(Src0->getReg()) ||
+ Register::isPhysicalRegister(Dst->getReg()))
break;
- return make_unique<SDWASrcOperand>(
+ return std::make_unique<SDWASrcOperand>(
Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
}
@@ -706,11 +706,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
+ if (Register::isPhysicalRegister(ValSrc->getReg()) ||
+ Register::isPhysicalRegister(Dst->getReg()))
break;
- return make_unique<SDWASrcOperand>(
+ return std::make_unique<SDWASrcOperand>(
ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
}
@@ -840,7 +840,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
assert(OrDst && OrDst->isReg());
- return make_unique<SDWADstPreserveOperand>(
+ return std::make_unique<SDWADstPreserveOperand>(
OrDst, OrSDWADef, OrOtherDef, DstSel);
}
@@ -1189,7 +1189,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
continue;
}
- unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
if (Op.isImm())
diff --git a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index f9bfe96f65cb..6cdd12d0e7bd 100644
--- a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -90,12 +90,12 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
if (!MO.isReg())
return false;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!TRI->isVGPR(*MRI, Reg))
return false;
- if (TRI->isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return false;
if (VRM->hasPhys(Reg))
@@ -124,14 +124,14 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
if (!MO.isReg())
continue;
- const unsigned VirtReg = MO.getReg();
- if (TRI->isPhysicalRegister(VirtReg))
+ const Register VirtReg = MO.getReg();
+ if (Register::isPhysicalRegister(VirtReg))
continue;
if (!VRM->hasPhys(VirtReg))
continue;
- unsigned PhysReg = VRM->getPhys(VirtReg);
+ Register PhysReg = VRM->getPhys(VirtReg);
const unsigned SubReg = MO.getSubReg();
if (SubReg != 0) {
PhysReg = TRI->getSubReg(PhysReg, SubReg);
@@ -149,7 +149,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
for (unsigned Reg : RegsToRewrite) {
LIS->removeInterval(Reg);
- const unsigned PhysReg = VRM->getPhys(Reg);
+ const Register PhysReg = VRM->getPhys(Reg);
assert(PhysReg != 0);
MFI->ReserveWWMRegister(PhysReg);
}
diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h
index 168f05f8fdd6..7c039a54b57f 100644
--- a/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/lib/Target/AMDGPU/SIProgramInfo.h
@@ -41,6 +41,8 @@ struct SIProgramInfo {
uint64_t ComputePGMRSrc2 = 0;
uint32_t NumVGPR = 0;
+ uint32_t NumArchVGPR = 0;
+ uint32_t NumAccVGPR = 0;
uint32_t NumSGPR = 0;
uint32_t LDSSize = 0;
bool FlatUsed = false;
@@ -51,6 +53,9 @@ struct SIProgramInfo {
// Number of VGPRs that meets number of waves per execution unit request.
uint32_t NumVGPRsForWavesPerEU = 0;
+ // Final occupancy.
+ uint32_t Occupancy = 0;
+
// Whether there is recursion, dynamic allocas, indirect calls or some other
// reason there may be statically unknown stack usage.
bool DynamicCallStack = false;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f152deb28004..f58bc3060c42 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -48,11 +48,6 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
}
}
-static cl::opt<bool> EnableSpillSGPRToSMEM(
- "amdgpu-spill-sgpr-to-smem",
- cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
- cl::init(false));
-
static cl::opt<bool> EnableSpillSGPRToVGPR(
"amdgpu-spill-sgpr-to-vgpr",
cl::desc("Enable spilling VGPRs to SGPRs"),
@@ -61,17 +56,12 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
AMDGPURegisterInfo(),
+ ST(ST),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()),
AGPRPressureSets(getNumRegPressureSets()),
- SpillSGPRToVGPR(false),
- SpillSGPRToSMEM(false),
+ SpillSGPRToVGPR(EnableSpillSGPRToVGPR),
isWave32(ST.isWave32()) {
- if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
- SpillSGPRToSMEM = true;
- else if (EnableSpillSGPRToVGPR)
- SpillSGPRToVGPR = true;
-
unsigned NumRegPressureSets = getNumRegPressureSets();
SGPRSetID = NumRegPressureSets;
@@ -118,11 +108,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
- return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}
static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
@@ -144,7 +132,6 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
@@ -202,8 +189,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AMDGPU::VCC_HI);
}
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
@@ -220,6 +205,14 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
+ // Reserve all remaining AGPRs if there are no instructions that use them.
+ if (!ST.hasMAIInsts()) {
+ for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
+ }
+ }
+
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
@@ -293,32 +286,17 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const
bool SIRegisterInfo::requiresFrameIndexScavenging(
const MachineFunction &MF) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.hasStackObjects())
- return true;
-
- // May need to deal with callee saved registers.
- const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return !Info->isEntryFunction();
+ // Do not use frame virtual registers. They used to be used for SGPRs, but
+ // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
+ // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
+ // spill.
+ return false;
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (!MFI.hasStackObjects())
- return false;
-
- // The scavenger is used for large frames which may require finding a free
- // register for large offsets.
- if (!isUInt<12>(MFI.getStackSize()))
- return true;
-
- // If using scalar stores, for spills, m0 is needed for the scalar store
- // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
- // register for it during frame index elimination, so the scavenger is
- // directly needed.
- return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
- MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
+ return MFI.hasStackObjects();
}
bool SIRegisterInfo::requiresVirtualBaseRegisters(
@@ -372,8 +350,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
DL = Ins->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = Subtarget.getInstrInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
if (Offset == 0) {
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
@@ -382,9 +359,9 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
}
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(Offset);
@@ -399,11 +376,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
int64_t Offset) const {
-
- MachineBasicBlock *MBB = MI.getParent();
- MachineFunction *MF = MBB->getParent();
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = Subtarget.getInstrInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
#ifndef NDEBUG
// FIXME: Is it possible to be storing a frame index to itself?
@@ -419,12 +392,15 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
#endif
MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+#ifndef NDEBUG
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+#endif
assert(FIOp && FIOp->isFI() && "frame index must be address operand");
assert(TII->isMUBUF(MI));
assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
- MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
- "should only be seeing frame offset relative FrameIndex");
-
+ MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() &&
+ "should only be seeing stack pointer offset relative FrameIndex");
MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
int64_t NewOffset = OffsetOp->getImm() + Offset;
@@ -564,7 +540,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
}
}
-static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
+static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
+ MachineBasicBlock::iterator MI,
int Index,
unsigned Lane,
unsigned ValueReg,
@@ -572,7 +549,6 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
@@ -595,11 +571,12 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
-static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
+static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
MachineFrameInfo &MFI,
MachineBasicBlock::iterator MI,
int Index,
int64_t Offset) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
MachineBasicBlock *MBB = MI->getParent();
const DebugLoc &DL = MI->getDebugLoc();
bool IsStore = MI->mayStore();
@@ -611,7 +588,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
- if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
+ if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
return true;
MachineInstrBuilder NewMI =
@@ -624,6 +601,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.cloneMemRefs(*MI);
const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
@@ -645,7 +623,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
RegScavenger *RS) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -707,8 +684,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
}
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
- unsigned SubReg = NumSubRegs == 1 ?
- ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
+ Register SubReg = NumSubRegs == 1
+ ? Register(ValueReg)
+ : getSubReg(ValueReg, getSubRegFromChannel(i));
unsigned SOffsetRegState = 0;
unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -718,7 +696,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
SrcDstRegState |= getKillRegState(IsKill);
}
- auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
+ auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);
if (!MIB.getInstr()) {
unsigned FinalReg = SubReg;
@@ -743,6 +721,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(NewMMO);
if (!IsStore && TmpReg != AMDGPU::NoRegister)
@@ -763,22 +742,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
}
}
-static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
- bool Store) {
- if (SuperRegSize % 16 == 0) {
- return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
- AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
- }
-
- if (SuperRegSize % 8 == 0) {
- return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
- AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
- }
-
- return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
- AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
-}
-
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
@@ -794,98 +757,37 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (OnlyToVGPR && !SpillToVGPR)
return false;
- MachineRegisterInfo &MRI = MF->getRegInfo();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- unsigned SuperReg = MI->getOperand(0).getReg();
+ Register SuperReg = MI->getOperand(0).getReg();
bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- bool SpillToSMEM = spillSGPRToSMEM();
- if (SpillToSMEM && OnlyToVGPR)
- return false;
-
- Register FrameReg = getFrameRegister(*MF);
-
assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
SuperReg != MFI->getFrameOffsetReg() &&
SuperReg != MFI->getScratchWaveOffsetReg()));
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- unsigned OffsetReg = AMDGPU::M0;
unsigned M0CopyReg = AMDGPU::NoRegister;
- if (SpillToSMEM) {
- if (RS->isRegUsed(AMDGPU::M0)) {
- M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
- .addReg(AMDGPU::M0);
- }
- }
-
- unsigned ScalarStoreOp;
unsigned EltSize = 4;
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
- if (SpillToSMEM && isSGPRClass(RC)) {
- // XXX - if private_element_size is larger than 4 it might be useful to be
- // able to spill wider vmem spills.
- std::tie(EltSize, ScalarStoreOp) =
- getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
- }
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+ // Scavenged temporary VGPR to use. It must be scavenged once for any number
+ // of spilled subregs.
+ Register TmpVGPR;
+
// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = NumSubRegs == 1 ?
- SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
- if (SpillToSMEM) {
- int64_t FrOffset = FrameInfo.getObjectOffset(Index);
-
- // The allocated memory size is really the wavefront size * the frame
- // index size. The widest register class is 64 bytes, so a 4-byte scratch
- // allocation is enough to spill this in a single stack object.
- //
- // FIXME: Frame size/offsets are computed earlier than this, so the extra
- // space is still unnecessarily allocated.
-
- unsigned Align = FrameInfo.getObjectAlignment(Index);
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- EltSize, MinAlign(Align, EltSize * i));
-
- // SMEM instructions only support a single offset, so increment the wave
- // offset.
-
- int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
- if (Offset != 0) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(FrameReg)
- .addImm(Offset);
- } else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(FrameReg);
- }
-
- BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
- .addReg(SubReg, getKillRegState(IsKill)) // sdata
- .addReg(MFI->getScratchRSrcReg()) // sbase
- .addReg(OffsetReg, RegState::Kill) // soff
- .addImm(0) // glc
- .addImm(0) // dlc
- .addMemOperand(MMO);
-
- continue;
- }
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
@@ -915,15 +817,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
return false;
// Spill SGPR to a frame index.
- // TODO: Should VI try to spill to VGPR and then spill to SMEM?
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // TODO: Should VI try to spill to VGPR and then spill to SMEM?
+ if (!TmpVGPR.isValid())
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
MachineInstrBuilder Mov
- = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(SubReg, SubKillState);
-
// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
if (NumSubRegs > 1) {
@@ -941,7 +841,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpReg, RegState::Kill) // src
+ .addReg(TmpVGPR, RegState::Kill) // src
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srrsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
@@ -965,7 +865,6 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
RegScavenger *RS,
bool OnlyToVGPR) const {
MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
@@ -976,84 +875,27 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
return false;
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
- unsigned SuperReg = MI->getOperand(0).getReg();
- bool SpillToSMEM = spillSGPRToSMEM();
- if (SpillToSMEM && OnlyToVGPR)
- return false;
+ Register SuperReg = MI->getOperand(0).getReg();
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- unsigned OffsetReg = AMDGPU::M0;
unsigned M0CopyReg = AMDGPU::NoRegister;
- if (SpillToSMEM) {
- if (RS->isRegUsed(AMDGPU::M0)) {
- M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
- .addReg(AMDGPU::M0);
- }
- }
-
unsigned EltSize = 4;
- unsigned ScalarLoadOp;
-
- Register FrameReg = getFrameRegister(*MF);
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
- if (SpillToSMEM && isSGPRClass(RC)) {
- // XXX - if private_element_size is larger than 4 it might be useful to be
- // able to spill wider vmem spills.
- std::tie(EltSize, ScalarLoadOp) =
- getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
- }
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
- // SubReg carries the "Kill" flag when SubReg == SuperReg.
- int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+ Register TmpVGPR;
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = NumSubRegs == 1 ?
- SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
- if (SpillToSMEM) {
- // FIXME: Size may be > 4 but extra bytes wasted.
- unsigned Align = FrameInfo.getObjectAlignment(Index);
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
- EltSize, MinAlign(Align, EltSize * i));
-
- // Add i * 4 offset
- int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
- if (Offset != 0) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(FrameReg)
- .addImm(Offset);
- } else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(FrameReg);
- }
-
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
- .addReg(MFI->getScratchRSrcReg()) // sbase
- .addReg(OffsetReg, RegState::Kill) // soff
- .addImm(0) // glc
- .addImm(0) // dlc
- .addMemOperand(MMO);
-
- if (NumSubRegs > 1 && i == 0)
- MIB.addReg(SuperReg, RegState::ImplicitDefine);
-
- continue;
- }
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
@@ -1071,7 +913,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR.isValid())
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
unsigned Align = FrameInfo.getObjectAlignment(Index);
MachinePointerInfo PtrInfo
@@ -1081,7 +924,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
MachineMemOperand::MOLoad, EltSize,
MinAlign(Align, EltSize * i));
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
@@ -1090,7 +933,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
auto MIB =
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
- .addReg(TmpReg, RegState::Kill);
+ .addReg(TmpVGPR, RegState::Kill);
if (NumSubRegs > 1)
MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
@@ -1141,11 +984,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
@@ -1255,13 +1096,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// In an entry function/kernel the offset is already the absolute
// address relative to the frame register.
- unsigned DiffReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register TmpDiffReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+
+ // If there's no free SGPR, modify the FP in place.
+ Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
Register ResultReg = IsCopy ?
MI->getOperand(0).getReg() :
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
.addReg(FrameReg)
@@ -1271,35 +1115,80 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (Offset == 0) {
// XXX - This never happens because of emergency scavenging slot at 0?
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
- .addImm(Log2_32(ST.getWavefrontSize()))
+ .addImm(ST.getWavefrontSizeLog2())
.addReg(DiffReg);
} else {
- unsigned ScaledReg
- = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
- .addImm(Log2_32(ST.getWavefrontSize()))
- .addReg(DiffReg, RegState::Kill);
-
- // TODO: Fold if use instruction is another add of a constant.
- if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
- TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
- .addImm(Offset)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(0); // clamp bit
+ if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
+ Register ScaledReg =
+ RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);
+
+ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ ScaledReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(DiffReg, RegState::Kill);
+
+ const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
+
+ // TODO: Fold if use instruction is another add of a constant.
+ if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+ // FIXME: This can fail
+ MIB.addImm(Offset);
+ MIB.addReg(ScaledReg, RegState::Kill);
+ if (!IsVOP2)
+ MIB.addImm(0); // clamp bit
+ } else {
+ Register ConstOffsetReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false);
+
+ // This should always be able to use the unused carry out.
+ assert(ConstOffsetReg && "this scavenge should not be able to fail");
+
+ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+ .addImm(Offset);
+ MIB.addReg(ConstOffsetReg, RegState::Kill);
+ MIB.addReg(ScaledReg, RegState::Kill);
+ MIB.addImm(0); // clamp bit
+ }
} else {
- unsigned ConstOffsetReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
- .addImm(Offset);
- TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
- .addReg(ConstOffsetReg, RegState::Kill)
+ // We have to produce a carry out, and there isn't a free SGPR
+ // pair for it. We can keep the whole computation on the SALU to
+ // avoid clobbering an additional register at the cost of an extra
+ // mov.
+
+ // We may have 1 free scratch SGPR even though a carry out is
+ // unavailable. Only one additional mov is needed.
+ Register TmpScaledReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+ Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+ .addReg(DiffReg, RegState::Kill)
+ .addImm(ST.getWavefrontSizeLog2());
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
.addReg(ScaledReg, RegState::Kill)
- .addImm(0); // clamp bit
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
+ .addReg(ScaledReg, RegState::Kill);
+
+ // If there were truly no free SGPRs, we need to undo everything.
+ if (!TmpScaledReg.isValid()) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
+ .addReg(DiffReg, RegState::Kill)
+ .addImm(ST.getWavefrontSizeLog2());
+ }
}
}
+ if (!TmpDiffReg.isValid()) {
+ // Restore the FP.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
+ .addReg(FrameReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
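The two no-free-register paths above borrow a live register (DiffReg, or the FP itself) and later emit inverse operations to restore it. A standalone arithmetic sketch, not LLVM code, of why those undo sequences recover the original values; it assumes the FP/scratch-wave-offset difference is a multiple of the wavefront size, which the right-shift/left-shift round trip relies on:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t WaveLog2 = 6;        // wave64
      const uint32_t Offset = 0x40;       // example frame-index offset
      const uint32_t WaveOffset = 0x800;  // example scratch wave offset
      uint32_t FP = WaveOffset + (0x123u << WaveLog2); // wave-aligned difference

      // Forward sequence: DiffReg = FP - WaveOffset, then shift out the
      // per-lane scaling and add the offset (value copied into ResultReg).
      uint32_t Reg = FP;
      Reg -= WaveOffset;
      uint32_t Saved = Reg;
      Reg = (Reg >> WaveLog2) + Offset;

      // Undo sequence emitted when no scratch SGPR could be scavenged.
      Reg = (Reg - Offset) << WaveLog2;
      assert(Reg == Saved && "shift/add round trip must restore DiffReg");

      // Restore of the FP when it was borrowed as DiffReg.
      Reg += WaveOffset;
      assert(Reg == FP && "FP must be restored after frame-index elimination");
      return 0;
    }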
// Don't introduce an extra copy if we're just materializing in a mov.
if (IsCopy)
MI->eraseFromParent();
@@ -1325,7 +1214,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int64_t NewOffset = OldImm + Offset;
if (isUInt<12>(NewOffset) &&
- buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
+ buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
MI->eraseFromParent();
return;
}
@@ -1337,7 +1226,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int64_t Offset = FrameInfo.getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
FIOp.ChangeToRegister(TmpReg, false, false, true);
@@ -1347,27 +1236,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
- const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
- unsigned Size = getRegSizeInBits(*RC);
- unsigned AltName = AMDGPU::NoRegAltName;
-
- switch (Size) {
- case 32: AltName = AMDGPU::Reg32; break;
- case 64: AltName = AMDGPU::Reg64; break;
- case 96: AltName = AMDGPU::Reg96; break;
- case 128: AltName = AMDGPU::Reg128; break;
- case 160: AltName = AMDGPU::Reg160; break;
- case 256: AltName = AMDGPU::Reg256; break;
- case 512: AltName = AMDGPU::Reg512; break;
- case 1024: AltName = AMDGPU::Reg1024; break;
- }
- return AMDGPUInstPrinter::getRegisterName(Reg, AltName);
+ return AMDGPUInstPrinter::getRegisterName(Reg);
}
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
- assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(!Register::isVirtualRegister(Reg));
static const TargetRegisterClass *const BaseClasses[] = {
&AMDGPU::VGPR_32RegClass,
@@ -1408,8 +1283,6 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
unsigned Size = getRegSizeInBits(*RC);
- if (Size < 32)
- return false;
switch (Size) {
case 32:
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
@@ -1427,8 +1300,11 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
case 1024:
return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
+ case 1:
+ return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr;
default:
- llvm_unreachable("Invalid register class size");
+ assert(Size < 32 && "Invalid register class size");
+ return false;
}
}
@@ -1476,6 +1352,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
return &AMDGPU::VReg_512RegClass;
case 1024:
return &AMDGPU::VReg_1024RegClass;
+ case 1:
+ return &AMDGPU::VReg_1RegClass;
default:
llvm_unreachable("Invalid register class size");
}
@@ -1509,7 +1387,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
case 96:
return &AMDGPU::SReg_96RegClass;
case 128:
- return &AMDGPU::SReg_128RegClass;
+ return &AMDGPU::SGPR_128RegClass;
case 160:
return &AMDGPU::SReg_160RegClass;
case 256:
@@ -1539,7 +1417,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
case 3:
return &AMDGPU::SReg_96RegClass;
case 4:
- return &AMDGPU::SReg_128RegClass;
+ return &AMDGPU::SGPR_128RegClass;
case 5:
return &AMDGPU::SReg_160RegClass;
case 8:
@@ -1587,6 +1465,15 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
}
}
+bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
+ if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
+ OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+ return !ST.hasMFMAInlineLiteralBug();
+
+ return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+ OpType <= AMDGPU::OPERAND_SRC_LAST;
+}
+
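For context on the operand-type check above: the header comment later in this diff lists the values that qualify as inline constants. A minimal, hypothetical sketch of the integer half of that test, assuming the documented -16..64 range is inclusive; the helper name is made up for the example:

    #include <cstdint>

    // Hypothetical helper, not from the patch: true if an integer immediate
    // falls in the inline-constant range described in SIRegisterInfo.h.
    static bool isInlineIntegerImm(int64_t Imm) {
      return Imm >= -16 && Imm <= 64;
    }

    int main() {
      return (isInlineIntegerImm(64) && !isInlineIntegerImm(65)) ? 0 : 1;
    }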
bool SIRegisterInfo::shouldRewriteCopySrc(
const TargetRegisterClass *DefRC,
unsigned DefSubReg,
@@ -1802,7 +1689,7 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
unsigned Reg) const {
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return MRI.getRegClass(Reg);
return getPhysRegClass(Reg);
@@ -1845,8 +1732,6 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
@@ -1900,18 +1785,22 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
return isWave32 ?
&AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
case AMDGPU::SGPRRegBankID:
- return &AMDGPU::SReg_32_XM0RegClass;
+ return &AMDGPU::SReg_32RegClass;
case AMDGPU::SCCRegBankID:
// This needs to return an allocatable class, so don't bother returning
// the dummy SCC class.
- return &AMDGPU::SReg_32_XM0RegClass;
+ //
+ // FIXME: This is a grotesque hack. We use SGPR_32 as an indication that
+ // this was not a VCC bank value, since we use the larger class SReg_32 for
+ // other values. These should all use SReg_32.
+ return &AMDGPU::SGPR_32RegClass;
default:
llvm_unreachable("unknown register bank");
}
}
case 32:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32_XM0RegClass;
+ &AMDGPU::SReg_32RegClass;
case 64:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
&AMDGPU::SReg_64_XEXECRegClass;
@@ -1920,7 +1809,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
&AMDGPU::SReg_96RegClass;
case 128:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
- &AMDGPU::SReg_128RegClass;
+ &AMDGPU::SGPR_128RegClass;
case 160:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
&AMDGPU::SReg_160RegClass;
@@ -1930,10 +1819,13 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
case 512:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
&AMDGPU::SReg_512RegClass;
+ case 1024:
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass :
+ &AMDGPU::SReg_1024RegClass;
default:
if (Size < 32)
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32_XM0RegClass;
+ &AMDGPU::SReg_32RegClass;
return nullptr;
}
}
@@ -1941,9 +1833,12 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
const MachineRegisterInfo &MRI) const {
- if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
+ const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
+ if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
- return nullptr;
+
+ const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
+ return getAllocatableClass(RC);
}
unsigned SIRegisterInfo::getVCC() const {
@@ -1974,7 +1869,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
SlotIndex UseIdx = LIS->getInstructionIndex(Use);
SlotIndex DefIdx;
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (!LIS->hasInterval(Reg))
return nullptr;
LiveInterval &LI = LIS->getInterval(Reg);
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 34487c96e72e..ac3dea1a1a28 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -27,6 +27,7 @@ class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPURegisterInfo {
private:
+ const GCNSubtarget &ST;
unsigned SGPRSetID;
unsigned VGPRSetID;
unsigned AGPRSetID;
@@ -34,7 +35,6 @@ private:
BitVector VGPRPressureSets;
BitVector AGPRPressureSets;
bool SpillSGPRToVGPR;
- bool SpillSGPRToSMEM;
bool isWave32;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
@@ -46,10 +46,6 @@ public:
return SpillSGPRToVGPR;
}
- bool spillSGPRToSMEM() const {
- return SpillSGPRToSMEM;
- }
-
/// Return the end register initially reserved for the scratch buffer in case
/// spilling is needed.
unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
@@ -141,7 +137,7 @@ public:
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
const TargetRegisterClass *RC;
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
RC = MRI.getRegClass(Reg);
else
RC = getPhysRegClass(Reg);
@@ -193,10 +189,7 @@ public:
/// \returns True if operands defined with this operand type can accept
/// an inline constant, i.e. an integer value in the range [-16, 64] or
/// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
- bool opCanUseInlineConstant(unsigned OpType) const {
- return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
- OpType <= AMDGPU::OPERAND_SRC_LAST;
- }
+ bool opCanUseInlineConstant(unsigned OpType) const;
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC,
@@ -270,7 +263,7 @@ public:
const MachineRegisterInfo &MRI) const override;
const TargetRegisterClass *getBoolRC() const {
- return isWave32 ? &AMDGPU::SReg_32_XM0RegClass
+ return isWave32 ? &AMDGPU::SReg_32RegClass
: &AMDGPU::SReg_64RegClass;
}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index d5948a7862cc..82219cbdf3b2 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -37,50 +37,52 @@ class getSubRegs<int size> {
!if(!eq(size, 16), ret16, ret32))))));
}
-let Namespace = "AMDGPU" in {
-defset list<RegAltNameIndex> AllRegAltNameIndices = {
- def Reg32 : RegAltNameIndex;
- def Reg64 : RegAltNameIndex;
- def Reg96 : RegAltNameIndex;
- def Reg128 : RegAltNameIndex;
- def Reg160 : RegAltNameIndex;
- def Reg256 : RegAltNameIndex;
- def Reg512 : RegAltNameIndex;
- def Reg1024 : RegAltNameIndex;
-}
-}
+// Generates a list of sequential register tuple names.
+// E.g. RegSeqNames<3,2,2,"s">.ret -> [ "s[0:1]", "s[2:3]" ]
+class RegSeqNames<int last_reg, int stride, int size, string prefix,
+ int start = 0> {
+ int next = !add(start, stride);
+ int end_reg = !add(!add(start, size), -1);
+ list<string> ret =
+ !if(!le(end_reg, last_reg),
+ !listconcat([prefix # "[" # start # ":" # end_reg # "]"],
+ RegSeqNames<last_reg, stride, size, prefix, next>.ret),
+ []);
+}
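As an aid to reading the recursive TableGen above, here is a minimal standalone C++ sketch, not part of the patch, of the name list RegSeqNames computes; the parameters mirror the TableGen ones:

    #include <iostream>
    #include <string>
    #include <vector>

    // Mirrors RegSeqNames<last_reg, stride, size, prefix>: emit one tuple name
    // per start register until the tuple would run past last_reg.
    static std::vector<std::string> regSeqNames(int LastReg, int Stride, int Size,
                                                const std::string &Prefix) {
      std::vector<std::string> Names;
      for (int Start = 0; Start + Size - 1 <= LastReg; Start += Stride)
        Names.push_back(Prefix + "[" + std::to_string(Start) + ":" +
                        std::to_string(Start + Size - 1) + "]");
      return Names;
    }

    int main() {
      // SGPR_64Regs below uses <SGPR_32, 105, 2, 2, "s">:
      // prints s[0:1], s[2:3], ..., s[104:105].
      for (const std::string &N : regSeqNames(105, 2, 2, "s"))
        std::cout << N << "\n";
    }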
+
+// Generates a list of dags for register tuples.
+class RegSeqDags<RegisterClass RC, int last_reg, int stride, int size,
+ int start = 0> {
+ dag trunc_rc = (trunc RC,
+ !if(!and(!eq(stride, 1), !eq(start, 0)),
+ !add(!add(last_reg, 2), !mul(size, -1)),
+ !add(last_reg, 1)));
+ list<dag> ret =
+ !if(!lt(start, size),
+ !listconcat([(add (decimate (shl trunc_rc, start), stride))],
+ RegSeqDags<RC, last_reg, stride, size, !add(start, 1)>.ret),
+ []);
+}
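The trunc_rc term above is the subtle part: for the first column of a stride-1 tuple it truncates the register file so the last tuple still fits, otherwise it keeps all last_reg + 1 registers and lets shl/decimate do the work. A standalone check, again an illustration rather than patch code, that this reproduces the truncation counts of the old hand-written VGPR tuple definitions below:

    #include <cassert>

    // Number of registers kept for the column starting at 'Start'.
    static int truncCount(int LastReg, int Stride, int Size, int Start) {
      if (Stride == 1 && Start == 0)
        return LastReg + 2 - Size; // last tuple must still fit in the file
      return LastReg + 1;          // whole file; decimate/shl handles the rest
    }

    int main() {
      // First-column counts from the old VGPR_64 .. VGPR_1024 definitions.
      assert(truncCount(255, 1, 2, 0) == 255);
      assert(truncCount(255, 1, 3, 0) == 254);
      assert(truncCount(255, 1, 4, 0) == 253);
      assert(truncCount(255, 1, 5, 0) == 252);
      assert(truncCount(255, 1, 8, 0) == 249);
      assert(truncCount(255, 1, 16, 0) == 241);
      assert(truncCount(255, 1, 32, 0) == 225);
      return 0;
    }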
+
+class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC,
+ int last_reg, int stride, int size, string prefix> :
+ RegisterTuples<Indices,
+ RegSeqDags<RC, last_reg, stride, size>.ret,
+ RegSeqNames<last_reg, stride, size, prefix>.ret>;
//===----------------------------------------------------------------------===//
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
-class SIReg <string n, bits<16> regIdx = 0, string prefix = "",
- int regNo = !cast<int>(regIdx)> :
- Register<n, !if(!eq(prefix, ""),
- [ n, n, n, n, n, n, n, n ],
- [ prefix # regNo,
- prefix # "[" # regNo # ":" # !and(!add(regNo, 1), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 2), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 3), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 4), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 7), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 15), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 31), 255) # "]",
- ])>,
+class SIReg <string n, bits<16> regIdx = 0> :
+ Register<n>,
DwarfRegNum<[!cast<int>(HWEncoding)]> {
let Namespace = "AMDGPU";
- let RegAltNameIndices = AllRegAltNameIndices;
// This is not yet the complete register encoding. An additional
// bit is set for VGPRs.
let HWEncoding = regIdx;
}
-class SIRegisterWithSubRegs<string n, list<Register> subregs> :
- RegisterWithSubRegs<n, subregs> {
- let RegAltNameIndices = AllRegAltNameIndices;
- let AltNames = [ n, n, n, n, n, n, n, n ];
-}
-
// Special Registers
def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
@@ -93,7 +95,7 @@ def SP_REG : SIReg<"sp", 0>;
def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>;
// VCC for 64-bit instructions
-def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
DwarfRegAlias<VCC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -103,7 +105,7 @@ def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
def EXEC_LO : SIReg<"exec_lo", 126>;
def EXEC_HI : SIReg<"exec_hi", 127>;
-def EXEC : SIRegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
+def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
DwarfRegAlias<EXEC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -134,7 +136,7 @@ def LDS_DIRECT : SIReg <"src_lds_direct", 254>;
def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>;
def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>;
-def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
+def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
DwarfRegAlias<XNACK_MASK_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -145,7 +147,7 @@ def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
-def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
+def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
DwarfRegAlias<TBA_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -155,7 +157,7 @@ def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
def TMA_LO : SIReg<"tma_lo", 110>;
def TMA_HI : SIReg<"tma_hi", 111>;
-def TMA : SIRegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
+def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
DwarfRegAlias<TMA_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -175,7 +177,7 @@ multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
}
class FlatReg <Register lo, Register hi, bits<16> encoding> :
- SIRegisterWithSubRegs<"flat_scratch", [lo, hi]>,
+ RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
DwarfRegAlias<lo> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -191,19 +193,19 @@ def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
// SGPR registers
foreach Index = 0-105 in {
- def SGPR#Index : SIReg <"SGPR"#Index, Index, "s">;
+ def SGPR#Index : SIReg <"s"#Index, Index>;
}
// VGPR registers
foreach Index = 0-255 in {
- def VGPR#Index : SIReg <"VGPR"#Index, Index, "v"> {
+ def VGPR#Index : SIReg <"v"#Index, Index> {
let HWEncoding{8} = 1;
}
}
// AccVGPR registers
foreach Index = 0-255 in {
- def AGPR#Index : SIReg <"AGPR"#Index, Index, "a"> {
+ def AGPR#Index : SIReg <"a"#Index, Index> {
let HWEncoding{8} = 1;
}
}
@@ -226,102 +228,32 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "SGPR%u", 0, 105)), Reg32> {
+ (add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
let AllocationPriority = 9;
}
// SGPR 64-bit registers
-def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret,
- [(add (decimate SGPR_32, 2)),
- (add (decimate (shl SGPR_32, 1), 2))]>;
+def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">;
// SGPR 96-bit registers. No operations use these, but they are defined for symmetry with 96-bit VGPRs.
-def SGPR_96Regs : RegisterTuples<getSubRegs<3>.ret,
- [(add (decimate SGPR_32, 3)),
- (add (decimate (shl SGPR_32, 1), 3)),
- (add (decimate (shl SGPR_32, 2), 3))]>;
+def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 3, 3, "s">;
// SGPR 128-bit registers
-def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4))]>;
+def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;
// SGPR 160-bit registers. No operations use these, but they are defined for symmetry with 160-bit VGPRs.
-def SGPR_160Regs : RegisterTuples<getSubRegs<5>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4)),
- (add (decimate (shl SGPR_32, 4), 4))]>;
+def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
// SGPR 256-bit registers
-def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4)),
- (add (decimate (shl SGPR_32, 4), 4)),
- (add (decimate (shl SGPR_32, 5), 4)),
- (add (decimate (shl SGPR_32, 6), 4)),
- (add (decimate (shl SGPR_32, 7), 4))]>;
+def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
// SGPR 512-bit registers
-def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4)),
- (add (decimate (shl SGPR_32, 4), 4)),
- (add (decimate (shl SGPR_32, 5), 4)),
- (add (decimate (shl SGPR_32, 6), 4)),
- (add (decimate (shl SGPR_32, 7), 4)),
- (add (decimate (shl SGPR_32, 8), 4)),
- (add (decimate (shl SGPR_32, 9), 4)),
- (add (decimate (shl SGPR_32, 10), 4)),
- (add (decimate (shl SGPR_32, 11), 4)),
- (add (decimate (shl SGPR_32, 12), 4)),
- (add (decimate (shl SGPR_32, 13), 4)),
- (add (decimate (shl SGPR_32, 14), 4)),
- (add (decimate (shl SGPR_32, 15), 4))]>;
+def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s">;
// SGPR 1024-bit registers
-def SGPR_1024Regs : RegisterTuples<getSubRegs<32>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4)),
- (add (decimate (shl SGPR_32, 4), 4)),
- (add (decimate (shl SGPR_32, 5), 4)),
- (add (decimate (shl SGPR_32, 6), 4)),
- (add (decimate (shl SGPR_32, 7), 4)),
- (add (decimate (shl SGPR_32, 8), 4)),
- (add (decimate (shl SGPR_32, 9), 4)),
- (add (decimate (shl SGPR_32, 10), 4)),
- (add (decimate (shl SGPR_32, 11), 4)),
- (add (decimate (shl SGPR_32, 12), 4)),
- (add (decimate (shl SGPR_32, 13), 4)),
- (add (decimate (shl SGPR_32, 14), 4)),
- (add (decimate (shl SGPR_32, 15), 4)),
- (add (decimate (shl SGPR_32, 16), 4)),
- (add (decimate (shl SGPR_32, 17), 4)),
- (add (decimate (shl SGPR_32, 18), 4)),
- (add (decimate (shl SGPR_32, 19), 4)),
- (add (decimate (shl SGPR_32, 20), 4)),
- (add (decimate (shl SGPR_32, 21), 4)),
- (add (decimate (shl SGPR_32, 22), 4)),
- (add (decimate (shl SGPR_32, 23), 4)),
- (add (decimate (shl SGPR_32, 24), 4)),
- (add (decimate (shl SGPR_32, 25), 4)),
- (add (decimate (shl SGPR_32, 26), 4)),
- (add (decimate (shl SGPR_32, 27), 4)),
- (add (decimate (shl SGPR_32, 28), 4)),
- (add (decimate (shl SGPR_32, 29), 4)),
- (add (decimate (shl SGPR_32, 30), 4)),
- (add (decimate (shl SGPR_32, 31), 4))]>;
+def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;
// Trap handler TMP 32-bit registers
def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
@@ -330,51 +262,21 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
}
// Trap handler TMP 64-bit registers
-def TTMP_64Regs : RegisterTuples<getSubRegs<2>.ret,
- [(add (decimate TTMP_32, 2)),
- (add (decimate (shl TTMP_32, 1), 2))]>;
+def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">;
// Trap handler TMP 128-bit registers
-def TTMP_128Regs : RegisterTuples<getSubRegs<4>.ret,
- [(add (decimate TTMP_32, 4)),
- (add (decimate (shl TTMP_32, 1), 4)),
- (add (decimate (shl TTMP_32, 2), 4)),
- (add (decimate (shl TTMP_32, 3), 4))]>;
-
-def TTMP_256Regs : RegisterTuples<getSubRegs<8>.ret,
- [(add (decimate TTMP_32, 4)),
- (add (decimate (shl TTMP_32, 1), 4)),
- (add (decimate (shl TTMP_32, 2), 4)),
- (add (decimate (shl TTMP_32, 3), 4)),
- (add (decimate (shl TTMP_32, 4), 4)),
- (add (decimate (shl TTMP_32, 5), 4)),
- (add (decimate (shl TTMP_32, 6), 4)),
- (add (decimate (shl TTMP_32, 7), 4))]>;
-
-def TTMP_512Regs : RegisterTuples<getSubRegs<16>.ret,
- [(add (decimate TTMP_32, 4)),
- (add (decimate (shl TTMP_32, 1), 4)),
- (add (decimate (shl TTMP_32, 2), 4)),
- (add (decimate (shl TTMP_32, 3), 4)),
- (add (decimate (shl TTMP_32, 4), 4)),
- (add (decimate (shl TTMP_32, 5), 4)),
- (add (decimate (shl TTMP_32, 6), 4)),
- (add (decimate (shl TTMP_32, 7), 4)),
- (add (decimate (shl TTMP_32, 8), 4)),
- (add (decimate (shl TTMP_32, 9), 4)),
- (add (decimate (shl TTMP_32, 10), 4)),
- (add (decimate (shl TTMP_32, 11), 4)),
- (add (decimate (shl TTMP_32, 12), 4)),
- (add (decimate (shl TTMP_32, 13), 4)),
- (add (decimate (shl TTMP_32, 14), 4)),
- (add (decimate (shl TTMP_32, 15), 4))]>;
+def TTMP_128Regs : SIRegisterTuples<getSubRegs<4>.ret, TTMP_32, 15, 4, 4, "ttmp">;
+
+def TTMP_256Regs : SIRegisterTuples<getSubRegs<8>.ret, TTMP_32, 15, 4, 8, "ttmp">;
+
+def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttmp">;
class TmpRegTuplesBase<int index, int size,
list<Register> subRegs,
list<SubRegIndex> indices = getSubRegs<size>.ret,
int index1 = !add(index, !add(size, -1)),
string name = "ttmp["#index#":"#index1#"]"> :
- SIRegisterWithSubRegs<name, subRegs> {
+ RegisterWithSubRegs<name, subRegs> {
let HWEncoding = subRegs[0].HWEncoding;
let SubRegIndices = indices;
}
@@ -448,196 +350,80 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT
TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10,
TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>;
+class RegisterTypes<list<ValueType> reg_types> {
+ list<ValueType> types = reg_types;
+}
+
+def Reg16Types : RegisterTypes<[i16, f16]>;
+def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
+
+
// VGPR 32-bit registers
// i16/f16 only on VI+
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "VGPR%u", 0, 255)), Reg32> {
+def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
+ (add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
}
// VGPR 64-bit registers
-def VGPR_64 : RegisterTuples<getSubRegs<2>.ret,
- [(add (trunc VGPR_32, 255)),
- (add (shl VGPR_32, 1))]>;
+def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 255, 1, 2, "v">;
// VGPR 96-bit registers
-def VGPR_96 : RegisterTuples<getSubRegs<3>.ret,
- [(add (trunc VGPR_32, 254)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2))]>;
+def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 255, 1, 3, "v">;
// VGPR 128-bit registers
-def VGPR_128 : RegisterTuples<getSubRegs<4>.ret,
- [(add (trunc VGPR_32, 253)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3))]>;
+def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">;
// VGPR 160-bit registers
-def VGPR_160 : RegisterTuples<getSubRegs<5>.ret,
- [(add (trunc VGPR_32, 252)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3)),
- (add (shl VGPR_32, 4))]>;
+def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
// VGPR 256-bit registers
-def VGPR_256 : RegisterTuples<getSubRegs<8>.ret,
- [(add (trunc VGPR_32, 249)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3)),
- (add (shl VGPR_32, 4)),
- (add (shl VGPR_32, 5)),
- (add (shl VGPR_32, 6)),
- (add (shl VGPR_32, 7))]>;
+def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
// VGPR 512-bit registers
-def VGPR_512 : RegisterTuples<getSubRegs<16>.ret,
- [(add (trunc VGPR_32, 241)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3)),
- (add (shl VGPR_32, 4)),
- (add (shl VGPR_32, 5)),
- (add (shl VGPR_32, 6)),
- (add (shl VGPR_32, 7)),
- (add (shl VGPR_32, 8)),
- (add (shl VGPR_32, 9)),
- (add (shl VGPR_32, 10)),
- (add (shl VGPR_32, 11)),
- (add (shl VGPR_32, 12)),
- (add (shl VGPR_32, 13)),
- (add (shl VGPR_32, 14)),
- (add (shl VGPR_32, 15))]>;
+def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;
// VGPR 1024-bit registers
-def VGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
- [(add (trunc VGPR_32, 225)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3)),
- (add (shl VGPR_32, 4)),
- (add (shl VGPR_32, 5)),
- (add (shl VGPR_32, 6)),
- (add (shl VGPR_32, 7)),
- (add (shl VGPR_32, 8)),
- (add (shl VGPR_32, 9)),
- (add (shl VGPR_32, 10)),
- (add (shl VGPR_32, 11)),
- (add (shl VGPR_32, 12)),
- (add (shl VGPR_32, 13)),
- (add (shl VGPR_32, 14)),
- (add (shl VGPR_32, 15)),
- (add (shl VGPR_32, 16)),
- (add (shl VGPR_32, 17)),
- (add (shl VGPR_32, 18)),
- (add (shl VGPR_32, 19)),
- (add (shl VGPR_32, 20)),
- (add (shl VGPR_32, 21)),
- (add (shl VGPR_32, 22)),
- (add (shl VGPR_32, 23)),
- (add (shl VGPR_32, 24)),
- (add (shl VGPR_32, 25)),
- (add (shl VGPR_32, 26)),
- (add (shl VGPR_32, 27)),
- (add (shl VGPR_32, 28)),
- (add (shl VGPR_32, 29)),
- (add (shl VGPR_32, 30)),
- (add (shl VGPR_32, 31))]>;
+def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">;
// AccVGPR 32-bit registers
def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "AGPR%u", 0, 255)), Reg32> {
+ (add (sequence "AGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
}
// AGPR 64-bit registers
-def AGPR_64 : RegisterTuples<getSubRegs<2>.ret,
- [(add (trunc AGPR_32, 255)),
- (add (shl AGPR_32, 1))]>;
+def AGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, AGPR_32, 255, 1, 2, "a">;
// AGPR 128-bit registers
-def AGPR_128 : RegisterTuples<getSubRegs<4>.ret,
- [(add (trunc AGPR_32, 253)),
- (add (shl AGPR_32, 1)),
- (add (shl AGPR_32, 2)),
- (add (shl AGPR_32, 3))]>;
+def AGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, AGPR_32, 255, 1, 4, "a">;
// AGPR 512-bit registers
-def AGPR_512 : RegisterTuples<getSubRegs<16>.ret,
- [(add (trunc AGPR_32, 241)),
- (add (shl AGPR_32, 1)),
- (add (shl AGPR_32, 2)),
- (add (shl AGPR_32, 3)),
- (add (shl AGPR_32, 4)),
- (add (shl AGPR_32, 5)),
- (add (shl AGPR_32, 6)),
- (add (shl AGPR_32, 7)),
- (add (shl AGPR_32, 8)),
- (add (shl AGPR_32, 9)),
- (add (shl AGPR_32, 10)),
- (add (shl AGPR_32, 11)),
- (add (shl AGPR_32, 12)),
- (add (shl AGPR_32, 13)),
- (add (shl AGPR_32, 14)),
- (add (shl AGPR_32, 15))]>;
+def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">;
// AGPR 1024-bit registers
-def AGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
- [(add (trunc AGPR_32, 225)),
- (add (shl AGPR_32, 1)),
- (add (shl AGPR_32, 2)),
- (add (shl AGPR_32, 3)),
- (add (shl AGPR_32, 4)),
- (add (shl AGPR_32, 5)),
- (add (shl AGPR_32, 6)),
- (add (shl AGPR_32, 7)),
- (add (shl AGPR_32, 8)),
- (add (shl AGPR_32, 9)),
- (add (shl AGPR_32, 10)),
- (add (shl AGPR_32, 11)),
- (add (shl AGPR_32, 12)),
- (add (shl AGPR_32, 13)),
- (add (shl AGPR_32, 14)),
- (add (shl AGPR_32, 15)),
- (add (shl AGPR_32, 16)),
- (add (shl AGPR_32, 17)),
- (add (shl AGPR_32, 18)),
- (add (shl AGPR_32, 19)),
- (add (shl AGPR_32, 20)),
- (add (shl AGPR_32, 21)),
- (add (shl AGPR_32, 22)),
- (add (shl AGPR_32, 23)),
- (add (shl AGPR_32, 24)),
- (add (shl AGPR_32, 25)),
- (add (shl AGPR_32, 26)),
- (add (shl AGPR_32, 27)),
- (add (shl AGPR_32, 28)),
- (add (shl AGPR_32, 29)),
- (add (shl AGPR_32, 30)),
- (add (shl AGPR_32, 31))]>;
+def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
//===----------------------------------------------------------------------===//
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG), Reg32> {
+ (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
- (add PRIVATE_RSRC_REG), Reg128> {
+ (add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add LDS_DIRECT), Reg32> {
+ (add LDS_DIRECT)> {
let isAllocatable = 0;
let CopyCost = -1;
}
@@ -648,41 +434,40 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
- SRC_VCCZ, SRC_EXECZ, SRC_SCC), Reg32> {
+ SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
let AllocationPriority = 10;
}
def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS), Reg32> {
+ (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
let AllocationPriority = 10;
}
def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI), Reg32> {
+ (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 10;
}
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI), Reg32> {
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
let AllocationPriority = 10;
}
def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS),
- Reg32> {
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
- (add SGPR_64Regs), Reg64> {
+ (add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 11;
}
// CCR (call clobbered registers) SGPR 64-bit registers
def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
- (add (trunc SGPR_64, 16)), Reg64> {
+ (add (trunc SGPR_64, 16))> {
let CopyCost = SGPR_64.CopyCost;
let AllocationPriority = SGPR_64.AllocationPriority;
}
@@ -693,13 +478,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
}
def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
- (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA), Reg64> {
+ (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 13;
}
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
- (add SReg_64_XEXEC, EXEC), Reg64> {
+ (add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 13;
}
@@ -722,17 +507,17 @@ let CopyCost = 2 in {
// There are no 3-component scalar instructions, but this is needed
// for symmetry with VGPRs.
def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
- (add SGPR_96Regs), Reg96> {
+ (add SGPR_96Regs)> {
let AllocationPriority = 14;
}
def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
- (add SGPR_96), Reg96> {
+ (add SGPR_96)> {
let AllocationPriority = 14;
}
def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
- (add SGPR_128Regs), Reg128> {
+ (add SGPR_128Regs)> {
let AllocationPriority = 15;
}
@@ -742,8 +527,9 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
}
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add SGPR_128, TTMP_128), Reg128> {
+ (add SGPR_128, TTMP_128)> {
let AllocationPriority = 15;
+ let isAllocatable = 0;
}
} // End CopyCost = 2
@@ -751,17 +537,16 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
// There are no 5-component scalar instructions, but this is needed
// for symmetry with VGPRs.
def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add SGPR_160Regs), Reg160> {
+ (add SGPR_160Regs)> {
let AllocationPriority = 16;
}
def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add SGPR_160), Reg160> {
+ (add SGPR_160)> {
let AllocationPriority = 16;
}
-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs),
- Reg256> {
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
let AllocationPriority = 17;
}
@@ -770,14 +555,14 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
}
def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
- (add SGPR_256, TTMP_256), Reg256> {
+ (add SGPR_256, TTMP_256)> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
let AllocationPriority = 17;
}
def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add SGPR_512Regs), Reg512> {
+ (add SGPR_512Regs)> {
let AllocationPriority = 18;
}
@@ -787,31 +572,31 @@ def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
}
def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add SGPR_512, TTMP_512), Reg512> {
+ (add SGPR_512, TTMP_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
let AllocationPriority = 18;
}
def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add VGPR_32, LDS_DIRECT_CLASS), Reg32> {
+ (add VGPR_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add SGPR_1024Regs), Reg1024> {
+ (add SGPR_1024Regs)> {
let AllocationPriority = 19;
}
def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add SGPR_1024), Reg1024> {
+ (add SGPR_1024)> {
let CopyCost = 16;
let AllocationPriority = 19;
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
- (add VGPR_64), Reg64> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], 32,
+ (add VGPR_64)> {
let Size = 64;
// Requires 2 v_mov_b32 to copy
@@ -819,7 +604,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32
let AllocationPriority = 2;
}
-def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> {
+def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> {
let Size = 96;
// Requires 3 v_mov_b32 to copy
@@ -828,7 +613,7 @@ def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96>
}
def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add VGPR_128), Reg128> {
+ (add VGPR_128)> {
let Size = 128;
// Requires 4 v_mov_b32 to copy
@@ -837,7 +622,7 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
}
def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add VGPR_160), Reg160> {
+ (add VGPR_160)> {
let Size = 160;
// Requires 5 v_mov_b32 to copy
@@ -846,28 +631,28 @@ def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
}
def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
- (add VGPR_256), Reg256> {
+ (add VGPR_256)> {
let Size = 256;
let CopyCost = 8;
let AllocationPriority = 6;
}
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add VGPR_512), Reg512> {
+ (add VGPR_512)> {
let Size = 512;
let CopyCost = 16;
let AllocationPriority = 7;
}
def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add VGPR_1024), Reg1024> {
+ (add VGPR_1024)> {
let Size = 1024;
let CopyCost = 32;
let AllocationPriority = 8;
}
def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
- (add AGPR_64), Reg64> {
+ (add AGPR_64)> {
let Size = 64;
let CopyCost = 5;
@@ -875,7 +660,7 @@ def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32
}
def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add AGPR_128), Reg128> {
+ (add AGPR_128)> {
let Size = 128;
// Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr
@@ -884,40 +669,39 @@ def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
}
def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add AGPR_512), Reg512> {
+ (add AGPR_512)> {
let Size = 512;
let CopyCost = 33;
let AllocationPriority = 7;
}
def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add AGPR_1024), Reg1024> {
+ (add AGPR_1024)> {
let Size = 1024;
let CopyCost = 65;
let AllocationPriority = 8;
}
-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32), Reg32> {
- let Size = 32;
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+ let Size = 1;
}
def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add VGPR_32, SReg_32, LDS_DIRECT_CLASS), Reg32> {
+ (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64),
- Reg64> {
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
let isAllocatable = 0;
}
def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add AGPR_32, VGPR_32), Reg32> {
+ (add AGPR_32, VGPR_32)> {
let isAllocatable = 0;
}
def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
- (add AReg_64, VReg_64), Reg64> {
+ (add AReg_64, VReg_64)> {
let isAllocatable = 0;
}
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 7ee178149c7a..8afca2cdc325 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -77,8 +77,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
// Try to fold Src0
MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (Src0.isReg()) {
- unsigned Reg = Src0.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
+ Register Reg = Src0.getReg();
+ if (Register::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
if (Def && Def->isMoveImmediate()) {
MachineOperand &MovSrc = Def->getOperand(1);
@@ -360,8 +360,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
}
if (NewImm != 0) {
- if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
- SrcReg->isReg()) {
+ if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) {
MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
return true;
@@ -394,12 +393,11 @@ static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
if (!MO.isReg())
continue;
- if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
- TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (Register::isPhysicalRegister(Reg) &&
+ Register::isPhysicalRegister(MO.getReg())) {
if (TRI.regsOverlap(Reg, MO.getReg()))
return true;
- } else if (MO.getReg() == Reg &&
- TargetRegisterInfo::isVirtualRegister(Reg)) {
+ } else if (MO.getReg() == Reg && Register::isVirtualRegister(Reg)) {
LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
TRI.getSubRegIndexLaneMask(MO.getSubReg());
if (Overlap.any())
@@ -425,7 +423,7 @@ static TargetInstrInfo::RegSubRegPair
getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
} else {
LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
@@ -459,13 +457,13 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
MovT.getOpcode() == AMDGPU::COPY);
- unsigned T = MovT.getOperand(0).getReg();
+ Register T = MovT.getOperand(0).getReg();
unsigned Tsub = MovT.getOperand(0).getSubReg();
MachineOperand &Xop = MovT.getOperand(1);
if (!Xop.isReg())
return nullptr;
- unsigned X = Xop.getReg();
+ Register X = Xop.getReg();
unsigned Xsub = Xop.getSubReg();
unsigned Size = TII->getOpSize(MovT, 0) / 4;
@@ -484,7 +482,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
MovY.getOperand(1).getSubReg() != Tsub)
continue;
- unsigned Y = MovY.getOperand(0).getReg();
+ Register Y = MovY.getOperand(0).getReg();
unsigned Ysub = MovY.getOperand(0).getSubReg();
if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
@@ -579,7 +577,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// XXX - not exactly a check for post-regalloc run.
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() &&
- TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+ Register::isPhysicalRegister(MI.getOperand(0).getReg())) {
int32_t ReverseImm;
if (isReverseInlineImm(TII, Src, ReverseImm)) {
MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
@@ -643,8 +641,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// FIXME: This could work better if hints worked with subregisters. If
// we have a vector add of a constant, we usually don't get the correct
// allocation due to the subregister usage.
- if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
- Src0->isReg()) {
+ if (Register::isVirtualRegister(Dest->getReg()) && Src0->isReg()) {
MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
continue;
@@ -672,8 +669,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
const MachineOperand &Dst = MI.getOperand(0);
MachineOperand &Src = MI.getOperand(1);
- if (Src.isImm() &&
- TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
+ if (Src.isImm() && Register::isPhysicalRegister(Dst.getReg())) {
int32_t ReverseImm;
if (isKImmOperand(TII, Src))
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
@@ -721,8 +717,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
if (TII->isVOPC(Op32)) {
- unsigned DstReg = MI.getOperand(0).getReg();
- if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ Register DstReg = MI.getOperand(0).getReg();
+ if (Register::isVirtualRegister(DstReg)) {
// VOPC instructions can only write to the VCC register. We can't
// force them to use VCC here, because this is only one register and
// cannot deal with sequences which would require multiple copies of
@@ -745,8 +741,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (!Src2->isReg())
continue;
- unsigned SReg = Src2->getReg();
- if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+ Register SReg = Src2->getReg();
+ if (Register::isVirtualRegister(SReg)) {
MRI.setRegAllocationHint(SReg, 0, VCCReg);
continue;
}
@@ -766,7 +762,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
bool Next = false;
if (SDst->getReg() != VCCReg) {
- if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
+ if (Register::isVirtualRegister(SDst->getReg()))
MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
Next = true;
}
@@ -774,7 +770,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// All of the instructions with carry outs also have an SGPR input in
// src2.
if (Src2 && Src2->getReg() != VCCReg) {
- if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
+ if (Register::isVirtualRegister(Src2->getReg()))
MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
Next = true;
}
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 4e07efff55d8..cb4cf68d709a 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -273,12 +273,12 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
if (!Use.isReg() || !Use.isUse())
continue;
- unsigned Reg = Use.getReg();
+ Register Reg = Use.getReg();
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!Register::isVirtualRegister(Reg)) {
if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
continue;
@@ -312,6 +312,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
SmallVector<MachineInstr *, 4> SetInactiveInstrs;
+ SmallVector<MachineInstr *, 4> SoftWQMInstrs;
// We need to visit the basic blocks in reverse post-order so that we visit
// defs before uses, in particular so that we don't accidentally mark an
@@ -340,6 +341,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// correct, so we need it to be in WQM.
Flags = StateWQM;
LowerToCopyInstrs.push_back(&MI);
+ } else if (Opcode == AMDGPU::SOFT_WQM) {
+ LowerToCopyInstrs.push_back(&MI);
+ SoftWQMInstrs.push_back(&MI);
+ continue;
} else if (Opcode == AMDGPU::WWM) {
// The WWM intrinsic doesn't make the same guarantee, and it also needs
// to be executed in WQM or Exact so that its copy doesn't clobber
@@ -356,8 +361,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
if (Inactive.isUndef()) {
LowerToCopyInstrs.push_back(&MI);
} else {
- unsigned Reg = Inactive.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ Register Reg = Inactive.getReg();
+ if (Register::isVirtualRegister(Reg)) {
for (MachineInstr &DefMI : MRI->def_instructions(Reg))
markInstruction(DefMI, StateWWM, Worklist);
}
@@ -385,9 +390,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
- if (!TRI->isVirtualRegister(Reg) &&
+ if (!Register::isVirtualRegister(Reg) &&
TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
Flags = StateWQM;
break;
@@ -407,9 +412,12 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
// ever used anywhere in the function. This implements the corresponding
// semantics of @llvm.amdgcn.set.inactive.
+ // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
if (GlobalFlags & StateWQM) {
for (MachineInstr *MI : SetInactiveInstrs)
markInstruction(*MI, StateWQM, Worklist);
+ for (MachineInstr *MI : SoftWQMInstrs)
+ markInstruction(*MI, StateWQM, Worklist);
}
return GlobalFlags;
@@ -548,7 +556,7 @@ bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before) {
- unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MachineInstr *Save =
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
@@ -832,7 +840,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();
- unsigned Dest = MI->getOperand(0).getReg();
+ Register Dest = MI->getOperand(0).getReg();
MachineInstr *Copy =
BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
.addReg(LiveMaskReg);
@@ -847,13 +855,12 @@ void SIWholeQuadMode::lowerCopyInstrs() {
for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
MI->RemoveOperand(i);
- const unsigned Reg = MI->getOperand(0).getReg();
+ const Register Reg = MI->getOperand(0).getReg();
if (TRI->isVGPR(*MRI, Reg)) {
- const TargetRegisterClass *regClass =
- TargetRegisterInfo::isVirtualRegister(Reg)
- ? MRI->getRegClass(Reg)
- : TRI->getPhysRegClass(Reg);
+ const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg)
+ ? MRI->getRegClass(Reg)
+ : TRI->getPhysRegClass(Reg);
const unsigned MovOp = TII->getMovOpcode(regClass);
MI->setDesc(TII->get(MovOp));
@@ -885,7 +892,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(Exec);
- if (!(GlobalFlags & StateWWM))
+ if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
return !LiveMaskQueries.empty();
} else {
// Store a copy of the original live mask when required
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 1b410b6b5912..1a74ebbf8165 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -793,9 +793,18 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> {
// selector to prefer those.
let AddedComplexity = 100 in {
-defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+foreach vt = Reg32Types.types in {
+defm : SMRD_Pattern <"S_LOAD_DWORD", vt>;
+}
+
+foreach vt = SReg_64.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", vt>;
+}
+
+foreach vt = SReg_128.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>;
+}
+
defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index dfafdccc05a3..d31a49f428ee 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -181,7 +181,9 @@ def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (ctpop i32:$src0))]
>;
-def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">;
+def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",
+ [(set i32:$sdst, (ctpop i64:$src0))]
+>;
} // End Defs = [SCC]
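A hedged illustration of the kind of source the new S_BCNT1_I32_B64 pattern targets: a population count of a uniform 64-bit value, for which only the 32-bit S_BCNT1 previously had a ctpop selection pattern here. The function is made up for the example:

    #include <cassert>
    #include <cstdint>

    // A uniform 64-bit popcount; the scalar selector can now match
    // (ctpop i64) directly rather than only the i32 form.
    static uint32_t uniform_popcount(uint64_t Mask) {
      return static_cast<uint32_t>(__builtin_popcountll(Mask));
    }

    int main() {
      assert(uniform_popcount(0xF0F0F0F0F0F0F0F0ull) == 32);
      return 0;
    }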
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
@@ -417,16 +419,16 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
let isCommutable = 1 in {
def S_MIN_I32 : SOP2_32 <"s_min_i32",
- [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
>;
def S_MIN_U32 : SOP2_32 <"s_min_u32",
- [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
>;
def S_MAX_I32 : SOP2_32 <"s_max_i32",
- [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
>;
def S_MAX_U32 : SOP2_32 <"s_max_u32",
- [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
>;
} // End isCommutable = 1
} // End Defs = [SCC]
@@ -853,13 +855,13 @@ class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
let Defs = [SCC];
}
class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
- string opName, PatLeaf cond> : SOPC_Base <
+ string opName, SDPatternOperator cond> : SOPC_Base <
op, rc, rc, opName,
[(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
}
class SOPC_CMP_32<bits<7> op, string opName,
- PatLeaf cond = COND_NULL, string revOp = opName>
+ SDPatternOperator cond = COND_NULL, string revOp = opName>
: SOPC_Helper<op, SSrc_b32, i32, opName, cond>,
Commutable_REV<revOp, !eq(revOp, opName)>,
SOPKInstTable<0, opName> {
@@ -868,7 +870,7 @@ class SOPC_CMP_32<bits<7> op, string opName,
}
class SOPC_CMP_64<bits<7> op, string opName,
- PatLeaf cond = COND_NULL, string revOp = opName>
+ SDPatternOperator cond = COND_NULL, string revOp = opName>
: SOPC_Helper<op, SSrc_b64, i64, opName, cond>,
Commutable_REV<revOp, !eq(revOp, opName)> {
let isCompare = 1;
@@ -1076,8 +1078,6 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
[(int_amdgcn_s_barrier)]> {
let SchedRW = [WriteBarrier];
let simm16 = 0;
- let mayLoad = 1;
- let mayStore = 1;
let isConvergent = 1;
}
@@ -1090,7 +1090,7 @@ def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16",
- [(int_amdgcn_s_waitcnt UIMM16bit:$simm16)]>;
+ [(int_amdgcn_s_waitcnt timm:$simm16)]>;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
@@ -1099,7 +1099,7 @@ def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
// maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the
// maximum really 15 on VI?
def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
- "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
+ "s_sleep $simm16", [(int_amdgcn_s_sleep timm:$simm16)]> {
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
@@ -1110,12 +1110,11 @@ def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
let Uses = [EXEC, M0] in {
// FIXME: Should this be mayLoad+mayStore?
def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
- [(AMDGPUsendmsg (i32 imm:$simm16))]
->;
+ [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]>;
def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16",
- [(AMDGPUsendmsghalt (i32 imm:$simm16))]
->;
+ [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]>;
+
} // End Uses = [EXEC, M0]
def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> {
@@ -1126,13 +1125,13 @@ def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
let simm16 = 0;
}
def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16",
- [(int_amdgcn_s_incperflevel SIMM16bit:$simm16)]> {
+ [(int_amdgcn_s_incperflevel timm:$simm16)]> {
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
}
def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16",
- [(int_amdgcn_s_decperflevel SIMM16bit:$simm16)]> {
+ [(int_amdgcn_s_decperflevel timm:$simm16)]> {
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
@@ -1169,7 +1168,10 @@ let SubtargetPredicate = isGFX10Plus in {
def S_ROUND_MODE :
SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
def S_DENORM_MODE :
- SOPP<0x025, (ins s16imm:$simm16), "s_denorm_mode $simm16">;
+ SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
+ [(SIdenorm_mode (i32 timm:$simm16))]> {
+ let hasSideEffects = 1;
+ }
def S_TTRACEDATA_IMM :
SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
} // End SubtargetPredicate = isGFX10Plus
@@ -1178,7 +1180,7 @@ let SubtargetPredicate = isGFX10Plus in {
// S_GETREG_B32 Intrinsic Pattern.
//===----------------------------------------------------------------------===//
def : GCNPat <
- (int_amdgcn_s_getreg imm:$simm16),
+ (int_amdgcn_s_getreg timm:$simm16),
(S_GETREG_B32 (as_i16imm $simm16))
>;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e90f40e6abea..afb2fd987afd 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -131,29 +131,70 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
struct MUBUFInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
- uint8_t dwords;
+ uint8_t elements;
bool has_vaddr;
bool has_srsrc;
bool has_soffset;
};
+struct MTBUFInfo {
+ uint16_t Opcode;
+ uint16_t BaseOpcode;
+ uint8_t elements;
+ bool has_vaddr;
+ bool has_srsrc;
+ bool has_soffset;
+};
+
+#define GET_MTBUFInfoTable_DECL
+#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
+int getMTBUFBaseOpcode(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc);
+ return Info ? Info->BaseOpcode : -1;
+}
+
+int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) {
+ const MTBUFInfo *Info = getMTBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements);
+ return Info ? Info->Opcode : -1;
+}
+
+int getMTBUFElements(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+ return Info ? Info->elements : 0;
+}
+
+bool getMTBUFHasVAddr(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+ return Info ? Info->has_vaddr : false;
+}
+
+bool getMTBUFHasSrsrc(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+ return Info ? Info->has_srsrc : false;
+}
+
+bool getMTBUFHasSoffset(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+ return Info ? Info->has_soffset : false;
+}
+
int getMUBUFBaseOpcode(unsigned Opc) {
const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc);
return Info ? Info->BaseOpcode : -1;
}
-int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) {
- const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords);
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements) {
+ const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements);
return Info ? Info->Opcode : -1;
}
-int getMUBUFDwords(unsigned Opc) {
+int getMUBUFElements(unsigned Opc) {
const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
- return Info ? Info->dwords : 0;
+ return Info ? Info->elements : 0;
}
bool getMUBUFHasVAddr(unsigned Opc) {
@@ -241,7 +282,7 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
}
unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
- return getMaxWavesPerEU() * getEUsPerCU(STI);
+ return getMaxWavesPerEU(STI) * getEUsPerCU(STI);
}
unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
@@ -253,9 +294,11 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
return 1;
}
-unsigned getMaxWavesPerEU() {
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
- return 10;
+ if (!isGFX10(*STI))
+ return 10;
+ return 20;
}
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
@@ -317,7 +360,7 @@ unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
if (Version.Major >= 10)
return 0;
- if (WavesPerEU >= getMaxWavesPerEU())
+ if (WavesPerEU >= getMaxWavesPerEU(STI))
return 0;
unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1);
@@ -394,17 +437,19 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
}
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
- return 256;
+ if (!isGFX10(*STI))
+ return 256;
+ return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
}
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
- return getTotalNumVGPRs(STI);
+ return 256;
}
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
- if (WavesPerEU >= getMaxWavesPerEU())
+ if (WavesPerEU >= getMaxWavesPerEU(STI))
return 0;
unsigned MinNumVGPRs =
alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1),
@@ -510,7 +555,7 @@ bool isReadOnlySegment(const GlobalValue *GV) {
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
- return TT.getOS() != Triple::AMDHSA;
+ return TT.getOS() == Triple::AMDPAL;
}
int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
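The hunks above make the per-EU wave limit and total VGPR count subtarget-dependent (10 vs. 20 waves, 256 vs. 512/1024 VGPRs on GFX10). A minimal sketch of how a caller might combine the changed helpers; the function name is illustrative and the VGPR math deliberately omits the allocation-granule rounding the real getMaxNumVGPRs logic applies, so treat it as an upper-bound estimate, not LLVM's occupancy computation.

// Sketch only: assumes it is built inside lib/Target/AMDGPU.
#include "Utils/AMDGPUBaseInfo.h"
#include <algorithm>

static unsigned approxMaxWavesPerEU(const llvm::MCSubtargetInfo *STI,
                                    unsigned VGPRsPerWave) {
  // 256 pre-GFX10; 1024 (wave32) or 512 (wave64) on GFX10, per the hunk above.
  unsigned TotalVGPRs = llvm::AMDGPU::getTotalNumVGPRs(STI);
  unsigned ByVGPRs = VGPRsPerWave ? TotalVGPRs / VGPRsPerWave : ~0u;
  // getMaxWavesPerEU(STI) now returns 10, or 20 on GFX10.
  return std::min(ByVGPRs, llvm::AMDGPU::getMaxWavesPerEU(STI));
}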
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 209ef7eef749..f78dadd447ff 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -94,7 +94,7 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// STI without any kind of limitation.
-unsigned getMaxWavesPerEU();
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI);
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// STI and limited by given \p FlatWorkGroupSize.
@@ -264,13 +264,31 @@ LLVM_READONLY
const MIMGInfo *getMIMGInfo(unsigned Opc);
LLVM_READONLY
+int getMTBUFBaseOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements);
+
+LLVM_READONLY
+int getMTBUFElements(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasVAddr(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasSrsrc(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasSoffset(unsigned Opc);
+
+LLVM_READONLY
int getMUBUFBaseOpcode(unsigned Opc);
LLVM_READONLY
-int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords);
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements);
LLVM_READONLY
-int getMUBUFDwords(unsigned Opc);
+int getMUBUFElements(unsigned Opc);
LLVM_READONLY
bool getMUBUFHasVAddr(unsigned Opc);
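The new MTBUF declarations mirror the existing MUBUF query helpers: each is a thin, null-checked wrapper over a TableGen-generated searchable table, keyed either by opcode or by (base opcode, element count). A short usage sketch of the MUBUF flavour; rewriteMUBUFElementCount is an illustrative helper, not an existing LLVM API, and it assumes the same in-tree include path the backend uses.

// Sketch only: compose the base-opcode and element-count lookups declared above.
#include "Utils/AMDGPUBaseInfo.h"

static int rewriteMUBUFElementCount(unsigned Opc, unsigned NewElements) {
  int Base = llvm::AMDGPU::getMUBUFBaseOpcode(Opc);
  if (Base == -1)
    return -1; // Opc is not a MUBUF instruction known to the table.
  // Ask the searchable table for the variant of the same base op with the
  // requested element count; -1 if no such encoding exists.
  return llvm::AMDGPU::getMUBUFOpcode(static_cast<unsigned>(Base), NewElements);
}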
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index db20d5ccf5f9..207e4232e829 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -21,6 +21,8 @@
#include "SIDefines.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/EndianStream.h"
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 6bc416ed7d4b..f1cdc3097dc0 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -104,9 +104,21 @@ multiclass VOP1Inst <string opName, VOPProfile P,
SDPatternOperator node = null_frag> {
def _e32 : VOP1_Pseudo <opName, P>;
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
- def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP1_DPP_Pseudo <opName, P>;
+
+ def : MnemonicAlias<opName#"_e32", opName>, LetDummies;
+ def : MnemonicAlias<opName#"_e64", opName>, LetDummies;
+
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def : MnemonicAlias<opName#"_sdwa", opName>, LetDummies;
+
+ foreach _ = BoolToList<P.HasExtDPP>.ret in
+ def : MnemonicAlias<opName#"_dpp", opName>, LetDummies;
}
// Special profile for instructions which have clamp
@@ -227,10 +239,10 @@ defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
} // End SchedRW = [WriteQuarterRate32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
-defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
-defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>;
+defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
-defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
@@ -434,7 +446,7 @@ let SubtargetPredicate = isGFX10Plus in {
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
-class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
+class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
VOP_DPP<ps.OpName, p, isDPP16> {
let hasSideEffects = ps.hasSideEffects;
let Defs = ps.Defs;
@@ -448,8 +460,9 @@ class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 =
let Inst{31-25} = 0x3f;
}
-class VOP1_DPP16<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
- VOP1_DPP<op, ps, p, 1> {
+class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> :
+ VOP1_DPP<op, ps, p, 1>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> {
let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
let SubtargetPredicate = HasDPP16;
}
@@ -492,6 +505,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP1_Real_sdwa_gfx10<bits<9> op> {
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae<op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
@@ -499,11 +513,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP1_Real_dpp_gfx10<bits<9> op> {
- def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")> {
let DecoderNamespace = "SDWA10";
}
}
multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "DPP8";
}
@@ -704,10 +720,12 @@ multiclass VOP1_Real_e32e64_vi <bits<10> op> {
multiclass VOP1_Real_vi <bits<10> op> {
defm NAME : VOP1_Real_e32e64_vi <op>;
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -831,25 +849,25 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
-let OtherPredicates = [isGFX8GFX9] in {
+let OtherPredicates = [isGFX8Plus] in {
def : GCNPat <
- (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
- imm:$bound_ctrl)),
+ (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
+ timm:$bound_ctrl)),
(V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl),
(as_i32imm $row_mask), (as_i32imm $bank_mask),
(as_i1imm $bound_ctrl))
>;
def : GCNPat <
- (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
- imm:$bank_mask, imm:$bound_ctrl)),
+ (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask,
+ timm:$bank_mask, timm:$bound_ctrl)),
(V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl),
(as_i32imm $row_mask), (as_i32imm $bank_mask),
(as_i1imm $bound_ctrl))
>;
-} // End OtherPredicates = [isGFX8GFX9]
+} // End OtherPredicates = [isGFX8Plus]
let OtherPredicates = [isGFX8Plus] in {
def : GCNPat<
@@ -885,6 +903,7 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
defm NAME : VOP1_Real_e32e64_vi <op>;
}
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -904,23 +923,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
let OtherPredicates = [isGFX10Plus] in {
def : GCNPat <
- (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)),
+ (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
(V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0))
>;
-
-def : GCNPat <
- (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
- imm:$bound_ctrl)),
- (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl), (i32 0))
->;
-
-def : GCNPat <
- (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
- imm:$bank_mask, imm:$bound_ctrl)),
- (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl), (i32 0))
->;
} // End OtherPredicates = [isGFX10Plus]
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 1b30cd2ed516..1ab0fc1ab58d 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -147,7 +147,8 @@ multiclass VOP2Inst_sdwa<string opName,
string revOp = opName,
bit GFX9Renamed = 0> {
let renamedInGFX9 = GFX9Renamed in {
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
} // End renamedInGFX9 = GFX9Renamed
}
@@ -179,9 +180,10 @@ multiclass VOP2bInst <string opName,
let usesCustomInserter = !eq(P.NumSrcArgs, 2);
}
- def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
- let AsmMatchConverter = "cvtSdwaVOP2b";
- }
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2b";
+ }
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
@@ -220,9 +222,10 @@ multiclass VOP2eInst <string opName,
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
- def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
- let AsmMatchConverter = "cvtSdwaVOP2b";
- }
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2e";
+ }
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP2_DPP_Pseudo <opName, P>;
@@ -251,7 +254,9 @@ multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
- field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
+ field dag Ins32 = !if(!eq(vt.Size, 32),
+ (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm),
+ (ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm));
field bit HasExt = 0;
// Hack to stop printing _e64
@@ -519,7 +524,7 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
} // End isConvergent = 1
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
-defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, add_ctpop>;
defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
@@ -539,9 +544,9 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfma
let SubtargetPredicate = isGFX6GFX7GFX10 in {
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>;
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX6GFX7GFX10
@@ -606,9 +611,9 @@ def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
} // End FPDPRounding = 1
-defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
-defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
-defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
+defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>;
+defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>;
+defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;
let isCommutable = 1 in {
let FPDPRounding = 1 in {
@@ -618,16 +623,16 @@ defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
} // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
-defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
+defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
-defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
-defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
-defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
-defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;
+defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16, umax>;
+defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>;
+defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>;
+defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>;
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
@@ -653,16 +658,17 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
- isCommutable = 1 in {
+ isCommutable = 1,
+ IsDOT = 1 in {
let SubtargetPredicate = HasDot5Insts in
- defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
+ defm V_DOT2C_F32_F16 : VOP2Inst<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
let SubtargetPredicate = HasDot6Insts in
- defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
+ defm V_DOT4C_I32_I8 : VOP2Inst<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
let SubtargetPredicate = HasDot4Insts in
- defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
+ defm V_DOT2C_I32_I16 : VOP2Inst<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
let SubtargetPredicate = HasDot3Insts in
- defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
+ defm V_DOT8C_I32_I4 : VOP2Inst<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
}
let AddedComplexity = 30 in {
@@ -719,50 +725,17 @@ defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>;
// Note: 16-bit instructions produce a 0 result in the high 16-bits
// on GFX8 and GFX9 and preserve high 16 bits on GFX10+
-def ClearHI16 : OutPatFrag<(ops node:$op),
- (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>;
-
-multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst,
- bit PreservesHI16 = 0> {
-
-def : GCNPat<
- (op i16:$src0, i16:$src1),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
->;
-
-def : GCNPat<
- (i32 (zext (op i16:$src0, i16:$src1))),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
->;
-
-def : GCNPat<
- (i64 (zext (op i16:$src0, i16:$src1))),
- (REG_SEQUENCE VReg_64,
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)),
- sub0,
- (V_MOV_B32_e32 (i32 0)), sub1)
->;
-}
-
-multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst,
- bit PreservesHI16 = 0> {
-
-def : GCNPat<
- (op i16:$src0, i16:$src1),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
->;
+multiclass Arithmetic_i16_0Hi_Pats <SDPatternOperator op, Instruction inst> {
def : GCNPat<
(i32 (zext (op i16:$src0, i16:$src1))),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
+ (inst $src0, $src1)
>;
-
def : GCNPat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)),
- sub0,
+ (inst $src0, $src1), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
}
@@ -774,53 +747,36 @@ class ZExt_i16_i1_Pat <SDNode ext> : GCNPat <
$src)
>;
-let Predicates = [Has16BitInsts] in {
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
-defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
-defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
-defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
-defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
-defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
-defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
-}
-
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64, 1>;
-defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64, 1>;
-defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64, 1>;
-}
-
+foreach vt = [i16, v2i16] in {
def : GCNPat <
- (and i16:$src0, i16:$src1),
- (V_AND_B32_e64 $src0, $src1)
+ (and vt:$src0, vt:$src1),
+ (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
>;
def : GCNPat <
- (or i16:$src0, i16:$src1),
- (V_OR_B32_e64 $src0, $src1)
+ (or vt:$src0, vt:$src1),
+ (V_OR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
>;
def : GCNPat <
- (xor i16:$src0, i16:$src1),
- (V_XOR_B32_e64 $src0, $src1)
+ (xor vt:$src0, vt:$src1),
+ (V_XOR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
>;
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
-defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
-defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
}
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64, 1>;
-defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64, 1>;
-defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64, 1>;
+let Predicates = [Has16BitInsts] in {
+
+let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
+defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<smin, V_MIN_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<smax, V_MAX_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<umin, V_MIN_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<lshl_rev, V_LSHLREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<lshr_rev, V_LSHRREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<ashr_rev, V_ASHRREV_I16_e64>;
}
def : ZExt_i16_i1_Pat<zext>;
@@ -847,7 +803,7 @@ def : GCNPat<
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
-class VOP2_DPP<bits<6> op, VOP2_Pseudo ps,
+class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl,
bit IsDPP16 = 0> :
VOP_DPP<opName, p, IsDPP16> {
@@ -865,13 +821,18 @@ class VOP2_DPP<bits<6> op, VOP2_Pseudo ps,
let Inst{31} = 0x0;
}
-class VOP2_DPP16<bits<6> op, VOP2_Pseudo ps,
+class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl> :
VOP2_DPP<op, ps, opName, p, 1> {
let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
let SubtargetPredicate = HasDPP16;
}
+class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
+ string opName = ps.OpName, VOPProfile p = ps.Pfl> :
+ Base_VOP2_DPP16<op, ps, opName, p>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10>;
+
class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl> :
VOP_DPP8<ps.OpName, p> {
@@ -924,6 +885,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP2_Real_sdwa_gfx10<bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
@@ -931,11 +893,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP2_Real_dpp_gfx10<bits<6> op> {
- def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
let DecoderNamespace = "SDWA10";
}
}
multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "DPP8";
}
@@ -964,6 +928,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let DecoderNamespace = "SDWA10" in {
multiclass VOP2_Real_sdwa_gfx10_with_name<bits<6> op, string opName,
string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
@@ -973,13 +938,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName,
string asmName> {
- def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp")> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP16;
}
}
multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName,
string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP8;
@@ -989,13 +956,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
} // End DecoderNamespace = "SDWA10"
//===------------------------------ VOP2be ------------------------------===//
- multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> {
+ multiclass VOP2be_Real_e32_gfx10<bits<6> op, string opName, string asmName> {
def _e32_gfx10 :
VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>,
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl> {
VOP2_Pseudo Ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands);
}
+ }
+ multiclass VOP2be_Real_e64_gfx10<bits<6> op, string opName, string asmName> {
def _e64_gfx10 :
VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
VOP3be_gfx10<{0, 1, 0, 0, op{5-0}},
@@ -1003,6 +972,9 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # Ps.AsmOperands;
}
+ }
+ multiclass VOP2be_Real_sdwa_gfx10<bits<6> op, string opName, string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
@@ -1010,64 +982,76 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands);
let DecoderNamespace = "SDWA10";
}
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_w32_gfx10 :
+ Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands);
+ let isAsmParserOnly = 1;
+ let DecoderNamespace = "SDWA10";
+ let WaveSizePredicate = isWave32;
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_w64_gfx10 :
+ Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # Ps.AsmOperands;
+ let isAsmParserOnly = 1;
+ let DecoderNamespace = "SDWA10";
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ multiclass VOP2be_Real_dpp_gfx10<bits<6> op, string opName, string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx10 :
- VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
let AsmString = asmName # !subst(", vcc", "", AsmDPP);
let DecoderNamespace = "SDWA10";
}
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_w32_gfx10 :
+ Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_w64_gfx10 :
+ Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ multiclass VOP2be_Real_dpp8_gfx10<bits<6> op, string opName, string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_gfx10 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
let DecoderNamespace = "DPP8";
}
-
- let WaveSizePredicate = isWave32 in {
- def _sdwa_w32_gfx10 :
- Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
- VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
- VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
- let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands);
- let isAsmParserOnly = 1;
- let DecoderNamespace = "SDWA10";
- }
- def _dpp_w32_gfx10 :
- VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
- string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
- let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
- let isAsmParserOnly = 1;
- }
- def _dpp8_w32_gfx10 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
- string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
- let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
- let isAsmParserOnly = 1;
- }
- } // End WaveSizePredicate = isWave32
-
- let WaveSizePredicate = isWave64 in {
- def _sdwa_w64_gfx10 :
- Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
- VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
- VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
- let AsmString = asmName # Ps.AsmOperands;
- let isAsmParserOnly = 1;
- let DecoderNamespace = "SDWA10";
- }
- def _dpp_w64_gfx10 :
- VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
- string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
- let AsmString = asmName # AsmDPP;
- let isAsmParserOnly = 1;
- }
- def _dpp8_w64_gfx10 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
- string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
- let AsmString = asmName # AsmDPP8;
- let isAsmParserOnly = 1;
- }
- } // End WaveSizePredicate = isWave64
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp8_w32_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp8_w64_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
}
//===----------------------------- VOP3Only -----------------------------===//
@@ -1088,8 +1072,19 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
-multiclass Base_VOP2_Real_gfx10<bits<6> op> :
- VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>;
+multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> :
+ VOP2be_Real_e32_gfx10<op, opName, asmName>,
+ VOP2be_Real_e64_gfx10<op, opName, asmName>,
+ VOP2be_Real_sdwa_gfx10<op, opName, asmName>,
+ VOP2be_Real_dpp_gfx10<op, opName, asmName>,
+ VOP2be_Real_dpp8_gfx10<op, opName, asmName>;
+
+multiclass VOP2e_Real_gfx10<bits<6> op, string opName, string asmName> :
+ VOP2_Real_e32_gfx10<op>,
+ VOP2_Real_e64_gfx10<op>,
+ VOP2be_Real_sdwa_gfx10<op, opName, asmName>,
+ VOP2be_Real_dpp_gfx10<op, opName, asmName>,
+ VOP2be_Real_dpp8_gfx10<op, opName, asmName>;
multiclass VOP2_Real_gfx10<bits<6> op> :
VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>,
@@ -1103,7 +1098,6 @@ multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName,
VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>,
VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>;
-defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>;
defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>;
defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>;
defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>;
@@ -1136,6 +1130,9 @@ defm V_SUB_CO_CI_U32 :
defm V_SUBREV_CO_CI_U32 :
VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">;
+defm V_CNDMASK_B32 :
+ VOP2e_Real_gfx10<0x001, "V_CNDMASK_B32", "v_cndmask_b32">;
+
// VOP3 only.
defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>;
defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>;
@@ -1322,12 +1319,14 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8"
multiclass VOP2_SDWA_Real <bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
multiclass VOP2_SDWA9_Real <bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -1350,12 +1349,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
let AsmString = AsmName # ps.AsmOperands;
let DecoderNamespace = "GFX8";
}
- def _sdwa_vi :
- VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
- VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
- VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
- let AsmString = AsmName # ps.AsmOperands;
- }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA>.ret in
+ def _sdwa_vi :
+ VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
+ VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
+ let AsmString = AsmName # ps.AsmOperands;
+ }
foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_vi :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
@@ -1383,12 +1383,13 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
let AsmString = AsmName # ps.AsmOperands;
let DecoderNamespace = "GFX9";
}
- def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
- VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
- VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
- let AsmString = AsmName # ps.AsmOperands;
- }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
+ let AsmString = AsmName # ps.AsmOperands;
+ }
foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
@@ -1410,10 +1411,11 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
let DecoderNamespace = "GFX9";
}
- def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
- VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
- }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ }
foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
@@ -1554,7 +1556,7 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
} // End SubtargetPredicate = HasDLInsts
multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
- def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+ def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> :
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 21dbef9240e1..605425972b1c 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -112,7 +112,7 @@ class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
- imm:$cbsz, imm:$abid, imm:$blgp))];
+ timm:$cbsz, timm:$abid, timm:$blgp))];
}
class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
@@ -385,12 +385,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
}
let SchedRW = [Write64Bit] in {
-let SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] in {
-def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>;
-def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>;
-def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>;
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
+def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
+def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10]
+} // End SubtargetPredicate = isGFX6GFX7GFX10
let SubtargetPredicate = isGFX8Plus in {
def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
@@ -399,21 +399,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, as
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
-let Predicates = [isGFX8Plus] in {
-def : GCNPat <
- (getDivergentFrag<shl>.ret i64:$x, i32:$y),
- (V_LSHLREV_B64 $y, $x)
->;
-def : AMDGPUPat <
- (getDivergentFrag<srl>.ret i64:$x, i32:$y),
- (V_LSHRREV_B64 $y, $x)
->;
-def : AMDGPUPat <
- (getDivergentFrag<sra>.ret i64:$x, i32:$y),
- (V_ASHRREV_I64 $y, $x)
->;
-}
-
let SchedRW = [Write32Bit] in {
let SubtargetPredicate = isGFX8Plus in {
@@ -468,13 +453,13 @@ let FPDPRounding = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
let Uses = [M0, EXEC] in {
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
- [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
- (i32 imm:$attr),
- (i32 imm:$src0_modifiers),
+ [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 timm:$attrchan),
+ (i32 timm:$attr),
+ (i32 timm:$src0_modifiers),
(f32 VRegSrc_32:$src2),
- (i32 imm:$src2_modifiers),
- (i1 imm:$high),
- (i1 imm:$clamp)))]>;
+ (i32 timm:$src2_modifiers),
+ (i1 timm:$high),
+ (i1 timm:$clamp)))]>;
} // End Uses = [M0, EXEC]
} // End FPDPRounding = 1
} // End renamedInGFX9 = 1
@@ -493,21 +478,21 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1
let Uses = [M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
- [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
- (i32 imm:$attr),
- (i32 imm:$src0_modifiers),
- (i1 imm:$high),
- (i1 imm:$clamp),
- (i32 imm:$omod)))]>;
+ [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 timm:$attrchan),
+ (i32 timm:$attr),
+ (i32 timm:$src0_modifiers),
+ (i1 timm:$high),
+ (i1 timm:$clamp),
+ (i32 timm:$omod)))]>;
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
- [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
- (i32 imm:$attr),
- (i32 imm:$src0_modifiers),
+ [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 timm:$attrchan),
+ (i32 timm:$attr),
+ (i32 timm:$src0_modifiers),
(f32 VRegSrc_32:$src2),
- (i32 imm:$src2_modifiers),
- (i1 imm:$high),
- (i1 imm:$clamp),
- (i32 imm:$omod)))]>;
+ (i32 timm:$src2_modifiers),
+ (i1 timm:$high),
+ (i1 timm:$clamp),
+ (i32 timm:$omod)))]>;
} // End Uses = [M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
@@ -657,11 +642,11 @@ let SubtargetPredicate = isGFX10Plus in {
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
def : GCNPat<
- (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
(V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
>;
def : GCNPat<
- (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
(V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
>;
} // End SubtargetPredicate = isGFX10Plus
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 55ee5f6577cf..0c13f39fec02 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -261,6 +261,7 @@ class SDot2Pat<Instruction Inst> : GCNPat <
let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
}
+let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
@@ -277,6 +278,7 @@ def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32
def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
} // End SubtargetPredicate = HasDot1Insts
+} // End let IsDOT = 1
multiclass DotPats<SDPatternOperator dot_op,
VOP3PInst dot_inst> {
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index b3513e383d10..8ef0ec7b71f4 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -183,7 +183,7 @@ multiclass VOPCXInstAliases <string OpName, string Arch> {
}
-class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
+class getVOPCPat64 <SDPatternOperator cond, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set i1:$sdst,
(setcc (P.Src0VT
@@ -202,7 +202,7 @@ class VCMPXNoSDstTable <bit has_sdst, string Name> {
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
- PatLeaf cond = COND_NULL,
+ SDPatternOperator cond = COND_NULL,
string revOp = opName,
bit DefExec = 0> {
@@ -225,6 +225,7 @@ multiclass VOPC_Pseudos <string opName,
let isCommutable = 1;
}
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
def _sdwa : VOPC_SDWA_Pseudo <opName, P> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
@@ -236,7 +237,7 @@ multiclass VOPC_Pseudos <string opName,
let SubtargetPredicate = HasSdstCMPX in {
multiclass VOPCX_Pseudos <string opName,
VOPC_Profile P, VOPC_Profile P_NoSDst,
- PatLeaf cond = COND_NULL,
+ SDPatternOperator cond = COND_NULL,
string revOp = opName> :
VOPC_Pseudos <opName, P, cond, revOp, 1> {
@@ -261,6 +262,7 @@ multiclass VOPCX_Pseudos <string opName,
let SubtargetPredicate = HasNoSdstCMPX;
}
+ foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
@@ -285,22 +287,23 @@ def VOPC_I16_I16 : VOPC_NoSdst_Profile<[Write32Bit], i16>;
def VOPC_I32_I32 : VOPC_NoSdst_Profile<[Write32Bit], i32>;
def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>;
-multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL,
+ string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
-multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_F32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>;
-multiclass VOPC_F64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
-multiclass VOPC_I16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>;
-multiclass VOPC_I32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
-multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
multiclass VOPCX_F16 <string opName, string revOp = opName> :
@@ -669,6 +672,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
let SchedRW = p.Schedule;
}
+ foreach _ = BoolToList<p.HasExtSDWA>.ret in
def _sdwa : VOPC_SDWA_Pseudo <opName, p> {
let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]),
!if(DefVcc, [VCC], []));
@@ -698,6 +702,7 @@ multiclass VOPCX_Class_Pseudos <string opName,
let SubtargetPredicate = HasNoSdstCMPX;
}
+ foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
@@ -737,8 +742,11 @@ defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">;
+
+let SubtargetPredicate = Has16BitInsts in {
defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">;
defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
+}
//===----------------------------------------------------------------------===//
// V_ICMPIntrinsic Pattern.
@@ -878,6 +886,7 @@ let AssemblerPredicate = isGFX10Plus in {
}
} // End DecoderNamespace = "GFX10"
+ foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -903,6 +912,7 @@ let AssemblerPredicate = isGFX10Plus in {
}
} // End DecoderNamespace = "GFX10"
+ foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa")>,
VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Pfl> {
@@ -1223,10 +1233,12 @@ multiclass VOPC_Real_vi <bits<10> op> {
}
}
+ foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+ foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 677095a354be..f208a1134a5a 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -14,6 +14,7 @@ class LetDummies {
bit isReMaterializable;
bit isAsCheapAsAMove;
bit VOPAsmPrefer32Bit;
+ bit FPDPRounding;
Predicate SubtargetPredicate;
string Constraints;
string DisableEncoding;
@@ -41,9 +42,7 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern>,
VOP <opName>,
- SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
- MnemonicAlias<opName#suffix, opName> {
-
+ SIMCInstr <opName#suffix, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let UseNamedOperandTable = 1;
@@ -148,6 +147,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
@@ -473,8 +473,7 @@ class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>,
VOP <opName>,
- SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>,
- MnemonicAlias <opName#"_sdwa", opName> {
+ SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -595,8 +594,7 @@ class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 {
class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
VOP <OpName>,
- SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
- MnemonicAlias <OpName#"_dpp", OpName> {
+ SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
diff --git a/lib/Target/ARC/ARCFrameLowering.h b/lib/Target/ARC/ARCFrameLowering.h
index 41b559d16761..9242400fb28d 100644
--- a/lib/Target/ARC/ARCFrameLowering.h
+++ b/lib/Target/ARC/ARCFrameLowering.h
@@ -27,8 +27,8 @@ class ARCInstrInfo;
class ARCFrameLowering : public TargetFrameLowering {
public:
ARCFrameLowering(const ARCSubtarget &st)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0), ST(st) {
- }
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(4), 0),
+ ST(st) {}
/// Insert Prologue into the function.
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
diff --git a/lib/Target/ARC/ARCISelLowering.cpp b/lib/Target/ARC/ARCISelLowering.cpp
index 847d23f0abdb..751fd567bae8 100644
--- a/lib/Target/ARC/ARCISelLowering.cpp
+++ b/lib/Target/ARC/ARCISelLowering.cpp
@@ -716,7 +716,7 @@ SDValue ARCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
assert(cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0 &&
"Only support lowering frame addr of current frame.");
- unsigned FrameReg = ARI.getFrameRegister(MF);
+ Register FrameReg = ARI.getFrameRegister(MF);
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
}
diff --git a/lib/Target/ARC/ARCMachineFunctionInfo.h b/lib/Target/ARC/ARCMachineFunctionInfo.h
index 31aa5b93246c..d4dcf9bf285c 100644
--- a/lib/Target/ARC/ARCMachineFunctionInfo.h
+++ b/lib/Target/ARC/ARCMachineFunctionInfo.h
@@ -34,8 +34,8 @@ public:
explicit ARCFunctionInfo(MachineFunction &MF)
: ReturnStackOffsetSet(false), VarArgsFrameIndex(0),
ReturnStackOffset(-1U), MaxCallStackReq(0) {
- // Functions are 4-byte (2**2) aligned.
- MF.setAlignment(2);
+ // Functions are 4-byte aligned.
+ MF.setAlignment(Align(4));
}
~ARCFunctionInfo() {}
diff --git a/lib/Target/ARC/ARCOptAddrMode.cpp b/lib/Target/ARC/ARCOptAddrMode.cpp
index c922b99c57b0..22a3b9111c8e 100644
--- a/lib/Target/ARC/ARCOptAddrMode.cpp
+++ b/lib/Target/ARC/ARCOptAddrMode.cpp
@@ -139,8 +139,7 @@ static bool dominatesAllUsesOf(const MachineInstr *MI, unsigned VReg,
MachineDominatorTree *MDT,
MachineRegisterInfo *MRI) {
- assert(TargetRegisterInfo::isVirtualRegister(VReg) &&
- "Expected virtual register!");
+ assert(Register::isVirtualRegister(VReg) && "Expected virtual register!");
for (auto it = MRI->use_nodbg_begin(VReg), end = MRI->use_nodbg_end();
it != end; ++it) {
@@ -181,7 +180,7 @@ static bool isLoadStoreThatCanHandleDisplacement(const TargetInstrInfo *TII,
bool ARCOptAddrMode::noUseOfAddBeforeLoadOrStore(const MachineInstr *Add,
const MachineInstr *Ldst) {
- unsigned R = Add->getOperand(0).getReg();
+ Register R = Add->getOperand(0).getReg();
return dominatesAllUsesOf(Ldst, R, MDT, MRI);
}
@@ -205,9 +204,8 @@ MachineInstr *ARCOptAddrMode::tryToCombine(MachineInstr &Ldst) {
return nullptr;
}
- unsigned B = Base.getReg();
- if (TargetRegisterInfo::isStackSlot(B) ||
- !TargetRegisterInfo::isVirtualRegister(B)) {
+ Register B = Base.getReg();
+ if (Register::isStackSlot(B) || !Register::isVirtualRegister(B)) {
LLVM_DEBUG(dbgs() << "[ABAW] Base is not VReg\n");
return nullptr;
}
@@ -285,7 +283,7 @@ ARCOptAddrMode::canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add,
return nullptr;
}
- unsigned BaseReg = Ldst->getOperand(BasePos).getReg();
+ Register BaseReg = Ldst->getOperand(BasePos).getReg();
// prohibit this:
// v1 = add v0, c
@@ -294,7 +292,7 @@ ARCOptAddrMode::canJoinInstructions(MachineInstr *Ldst, MachineInstr *Add,
// st v0, [v0, 0]
// v1 = add v0, c
if (Ldst->mayStore() && Ldst->getOperand(0).isReg()) {
- unsigned StReg = Ldst->getOperand(0).getReg();
+ Register StReg = Ldst->getOperand(0).getReg();
if (Add->getOperand(0).getReg() == StReg || BaseReg == StReg) {
LLVM_DEBUG(dbgs() << "[canJoinInstructions] Store uses result of Add\n");
return nullptr;
@@ -447,7 +445,7 @@ void ARCOptAddrMode::changeToAddrMode(MachineInstr &Ldst, unsigned NewOpcode,
MachineOperand Src = MachineOperand::CreateImm(0xDEADBEEF);
AII->getBaseAndOffsetPosition(Ldst, BasePos, OffPos);
- unsigned BaseReg = Ldst.getOperand(BasePos).getReg();
+ Register BaseReg = Ldst.getOperand(BasePos).getReg();
Ldst.RemoveOperand(OffPos);
Ldst.RemoveOperand(BasePos);
diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp
index 9c8340ac8f81..a7f89b385ffe 100644
--- a/lib/Target/ARC/ARCRegisterInfo.cpp
+++ b/lib/Target/ARC/ARCRegisterInfo.cpp
@@ -206,7 +206,7 @@ void ARCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
LLVM_DEBUG(dbgs() << "Offset : " << Offset << "\n"
<< "<--------->\n");
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
assert(ARC::GPR32RegClass.contains(Reg) && "Unexpected register operand");
if (!TFI->hasFP(MF)) {
diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp
index 9fb45d686c26..34700dc22c54 100644
--- a/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/lib/Target/ARC/ARCTargetMachine.cpp
@@ -38,7 +38,7 @@ ARCTargetMachine::ARCTargetMachine(const Target &T, const Triple &TT,
"f32:32:32-i64:32-f64:32-a:0:32-n32",
TT, CPU, FS, Options, getRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
- TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index fb238bfc9cbc..30b9c8071ba2 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -133,9 +133,9 @@ bool A15SDOptimizer::usesRegClass(MachineOperand &MO,
const TargetRegisterClass *TRC) {
if (!MO.isReg())
return false;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return MRI->getRegClass(Reg)->hasSuperClassEq(TRC);
else
return TRC->contains(Reg);
@@ -151,7 +151,7 @@ unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
// Get the subreg type that is most likely to be coalesced
// for an SPR register that will be used in VDUP32d pseudo.
unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
- if (!TRI->isVirtualRegister(SReg))
+ if (!Register::isVirtualRegister(SReg))
return getDPRLaneFromSPR(SReg);
MachineInstr *MI = MRI->getVRegDef(SReg);
@@ -166,7 +166,7 @@ unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
SReg = MI->getOperand(1).getReg();
}
- if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+ if (Register::isVirtualRegister(SReg)) {
if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
return ARM::ssub_0;
}
@@ -191,8 +191,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
for (MachineOperand &MO : MI->operands()) {
if ((!MO.isReg()) || (!MO.isUse()))
continue;
- unsigned Reg = MO.getReg();
- if (!TRI->isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
MachineOperand *Op = MI->findRegisterDefOperand(Reg);
@@ -213,8 +213,8 @@ void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
for (MachineOperand &MODef : Def->operands()) {
if ((!MODef.isReg()) || (!MODef.isDef()))
continue;
- unsigned DefReg = MODef.getReg();
- if (!TRI->isVirtualRegister(DefReg)) {
+ Register DefReg = MODef.getReg();
+ if (!Register::isVirtualRegister(DefReg)) {
IsDead = false;
break;
}
@@ -245,10 +245,10 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
}
if (MI->isInsertSubreg()) {
- unsigned DPRReg = MI->getOperand(1).getReg();
- unsigned SPRReg = MI->getOperand(2).getReg();
+ Register DPRReg = MI->getOperand(1).getReg();
+ Register SPRReg = MI->getOperand(2).getReg();
- if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
+ if (Register::isVirtualRegister(DPRReg) && Register::isVirtualRegister(SPRReg)) {
MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());
@@ -267,7 +267,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
// Find the thing we're subreg copying out of - is it of the same
// regclass as DPRMI? (i.e. a DPR or QPR).
- unsigned FullReg = SPRMI->getOperand(1).getReg();
+ Register FullReg = SPRMI->getOperand(1).getReg();
const TargetRegisterClass *TRC =
MRI->getRegClass(MI->getOperand(1).getReg());
if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
@@ -296,9 +296,9 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
if (!MI->getOperand(I).isReg())
continue;
++NumTotal;
- unsigned OpReg = MI->getOperand(I).getReg();
+ Register OpReg = MI->getOperand(I).getReg();
- if (!TRI->isVirtualRegister(OpReg))
+ if (!Register::isVirtualRegister(OpReg))
break;
MachineInstr *Def = MRI->getVRegDef(OpReg);
@@ -342,7 +342,7 @@ bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
if (!MI->isFullCopy())
return MI;
- if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+ if (!Register::isVirtualRegister(MI->getOperand(1).getReg()))
return nullptr;
MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
if (!Def)
@@ -369,8 +369,8 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
Reached.insert(MI);
if (MI->isPHI()) {
for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
- unsigned Reg = MI->getOperand(I).getReg();
- if (!TRI->isVirtualRegister(Reg)) {
+ Register Reg = MI->getOperand(I).getReg();
+ if (!Register::isVirtualRegister(Reg)) {
continue;
}
MachineInstr *NewMI = MRI->getVRegDef(Reg);
@@ -379,7 +379,7 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
Front.push_back(NewMI);
}
} else if (MI->isFullCopy()) {
- if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+ if (!Register::isVirtualRegister(MI->getOperand(1).getReg()))
continue;
MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg());
if (!NewMI)
@@ -418,8 +418,8 @@ unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned Reg,
unsigned Lane, bool QPR) {
- unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
- &ARM::DPRRegClass);
+ Register Out =
+ MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass : &ARM::DPRRegClass);
BuildMI(MBB, InsertBefore, DL,
TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d), Out)
.addReg(Reg)
@@ -434,7 +434,7 @@ unsigned A15SDOptimizer::createExtractSubreg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned DReg, unsigned Lane,
const TargetRegisterClass *TRC) {
- unsigned Out = MRI->createVirtualRegister(TRC);
+ Register Out = MRI->createVirtualRegister(TRC);
BuildMI(MBB,
InsertBefore,
DL,
@@ -448,7 +448,7 @@ unsigned A15SDOptimizer::createExtractSubreg(
unsigned A15SDOptimizer::createRegSequence(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned Reg1, unsigned Reg2) {
- unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
+ Register Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
BuildMI(MBB,
InsertBefore,
DL,
@@ -466,7 +466,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned Ssub0,
unsigned Ssub1) {
- unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+ Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out)
.addReg(Ssub0)
.addReg(Ssub1)
@@ -478,7 +478,7 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
unsigned A15SDOptimizer::createInsertSubreg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) {
- unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
+ Register Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
BuildMI(MBB,
InsertBefore,
DL,
@@ -494,7 +494,7 @@ unsigned
A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const DebugLoc &DL) {
- unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+ Register Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
BuildMI(MBB,
InsertBefore,
DL,
@@ -602,7 +602,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
// we can end up with multiple defs of this DPR.
SmallVector<MachineInstr *, 8> DefSrcs;
- if (!TRI->isVirtualRegister(*I))
+ if (!Register::isVirtualRegister(*I))
continue;
MachineInstr *Def = MRI->getVRegDef(*I);
if (!Def)
@@ -622,7 +622,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
// Collect all the uses of this MI's DPR def for updating later.
SmallVector<MachineOperand*, 8> Uses;
- unsigned DPRDefReg = MI->getOperand(0).getReg();
+ Register DPRDefReg = MI->getOperand(0).getReg();
for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
E = MRI->use_end(); I != E; ++I)
Uses.push_back(&*I);
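
This file, like most of the patch, replaces plain unsigned register variables with llvm::Register and routes the virtual/physical checks through its static helpers; because Register still converts implicitly to unsigned, the change stays mechanical. A standalone sketch of that shape (not llvm::Register itself; the top-bit encoding for virtual registers is an assumption made for illustration only):

#include <cassert>
#include <cstdio>

// Sketch only: a register handle that still behaves like an unsigned id but
// makes the virtual/physical distinction explicit. Assumes the top bit marks
// a virtual register.
class Reg {
  unsigned Id;
public:
  constexpr Reg(unsigned Id = 0) : Id(Id) {}
  constexpr operator unsigned() const { return Id; } // keeps old call sites compiling
  static constexpr bool isVirtual(unsigned R) { return (R & 0x80000000u) != 0; }
  static constexpr bool isPhysical(unsigned R) { return R != 0 && !isVirtual(R); }
};

int main() {
  Reg Virt(0x80000002u), Phys(3);
  assert(Reg::isVirtual(Virt) && Reg::isPhysical(Phys));
  std::printf("virt id = %u, phys id = %u\n", unsigned(Virt), unsigned(Phys));
}
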
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index bf8ed6562fe7..2e6f756d522c 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -35,6 +35,7 @@ class MachineInstr;
class MCInst;
class PassRegistry;
+Pass *createMVETailPredicationPass();
FunctionPass *createARMLowOverheadLoopsPass();
Pass *createARMParallelDSPPass();
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
@@ -67,6 +68,7 @@ void initializeThumb2SizeReducePass(PassRegistry &);
void initializeThumb2ITBlockPass(PassRegistry &);
void initializeMVEVPTBlockPass(PassRegistry &);
void initializeARMLowOverheadLoopsPass(PassRegistry &);
+void initializeMVETailPredicationPass(PassRegistry &);
} // end namespace llvm
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index b687db12eaf5..fed4cb2b9316 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -57,12 +57,15 @@ def FeatureD32 : SubtargetFeature<"d32", "HasD32", "true",
"Extend FP to 32 double registers">;
multiclass VFPver<string name, string query, string description,
- list<SubtargetFeature> prev = [],
- list<SubtargetFeature> otherimplies = []> {
+ list<SubtargetFeature> prev,
+ list<SubtargetFeature> otherimplies,
+ list<SubtargetFeature> vfp2prev = []> {
def _D16_SP: SubtargetFeature<
name#"d16sp", query#"D16SP", "true",
description#" with only 16 d-registers and no double precision",
- !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) # otherimplies>;
+ !foreach(v, prev, !cast<SubtargetFeature>(v # "_D16_SP")) #
+ !foreach(v, vfp2prev, !cast<SubtargetFeature>(v # "_SP")) #
+ otherimplies>;
def _SP: SubtargetFeature<
name#"sp", query#"SP", "true",
description#" with no double precision",
@@ -72,6 +75,7 @@ multiclass VFPver<string name, string query, string description,
name#"d16", query#"D16", "true",
description#" with only 16 d-registers",
!foreach(v, prev, !cast<SubtargetFeature>(v # "_D16")) #
+ vfp2prev #
otherimplies # [FeatureFP64, !cast<SubtargetFeature>(NAME # "_D16_SP")]>;
def "": SubtargetFeature<
name, query, "true", description,
@@ -80,11 +84,17 @@ multiclass VFPver<string name, string query, string description,
!cast<SubtargetFeature>(NAME # "_SP")]>;
}
-defm FeatureVFP2: VFPver<"vfp2", "HasVFPv2", "Enable VFP2 instructions",
- [], [FeatureFPRegs]>;
+def FeatureVFP2_SP : SubtargetFeature<"vfp2sp", "HasVFPv2SP", "true",
+ "Enable VFP2 instructions with "
+ "no double precision",
+ [FeatureFPRegs]>;
+
+def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
+ "Enable VFP2 instructions",
+ [FeatureFP64, FeatureVFP2_SP]>;
defm FeatureVFP3: VFPver<"vfp3", "HasVFPv3", "Enable VFP3 instructions",
- [FeatureVFP2]>;
+ [], [], [FeatureVFP2]>;
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable NEON instructions",
@@ -98,7 +108,7 @@ defm FeatureVFP4: VFPver<"vfp4", "HasVFPv4", "Enable VFP4 instructions",
[FeatureVFP3], [FeatureFP16]>;
defm FeatureFPARMv8: VFPver<"fp-armv8", "HasFPARMv8", "Enable ARMv8 FP",
- [FeatureVFP4]>;
+ [FeatureVFP4], []>;
def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
"Enable full half-precision "
@@ -302,9 +312,18 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
"Prefer 32-bit Thumb instrs">;
-def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment","2",
+def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2",
"Prefer 32-bit alignment for loops">;
+def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "1",
+ "Model MVE instructions as a 1 beat per tick architecture">;
+
+def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2",
+ "Model MVE instructions as a 2 beats per tick architecture">;
+
+def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "4",
+ "Model MVE instructions as a 4 beats per tick architecture">;
+
/// Some instructions update CPSR partially, which can add false dependency for
/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
/// mapped to a separate physical register. Avoid partial CPSR update for these
@@ -1156,6 +1175,13 @@ def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76,
FeatureFullFP16,
FeatureDotProd]>;
+def : ProcNoItin<"neoverse-n1", [ARMv82a,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureDotProd]>;
+
def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureHasRetAddrStack,
FeatureNEONForFP,
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index e29077266fcd..c8c91e53c44e 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -168,7 +168,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// relatively easy to exceed the thumb branch range within a TU.
if (! ThumbIndirectPads.empty()) {
OutStreamer->EmitAssemblerFlag(MCAF_Code16);
- EmitAlignment(1);
+ EmitAlignment(Align(2));
for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) {
OutStreamer->EmitLabel(TIP.second);
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX)
@@ -203,8 +203,8 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
switch (MO.getType()) {
default: llvm_unreachable("<unknown operand type>");
case MachineOperand::MO_Register: {
- unsigned Reg = MO.getReg();
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ Register Reg = MO.getReg();
+ assert(Register::isPhysicalRegister(Reg));
assert(!MO.getSubReg() && "Subregs should be eliminated!");
if(ARM::GPRPairRegClass.contains(Reg)) {
const MachineFunction &MF = *MI->getParent()->getParent();
@@ -275,7 +275,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return false;
case 'y': // Print a VFP single precision register as indexed double.
if (MI->getOperand(OpNum).isReg()) {
- unsigned Reg = MI->getOperand(OpNum).getReg();
+ Register Reg = MI->getOperand(OpNum).getReg();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
// Find the 'd' register that has this 's' register as a sub-register,
// and determine the lane number.
@@ -302,14 +302,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
if (!MI->getOperand(OpNum).isReg())
return true;
const MachineOperand &MO = MI->getOperand(OpNum);
- unsigned RegBegin = MO.getReg();
+ Register RegBegin = MO.getReg();
// This takes advantage of the 2 operand-ness of ldm/stm and that we've
// already got the operands in registers that are operands to the
// inline asm statement.
O << "{";
if (ARM::GPRPairRegClass.contains(RegBegin)) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
+ Register Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
O << ARMInstPrinter::getRegisterName(Reg0) << ", ";
RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1);
}
@@ -378,8 +378,8 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
if (!MO.isReg())
return true;
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- unsigned Reg = TRI->getSubReg(MO.getReg(), FirstHalf ?
- ARM::gsub_0 : ARM::gsub_1);
+ Register Reg =
+ TRI->getSubReg(MO.getReg(), FirstHalf ? ARM::gsub_0 : ARM::gsub_1);
O << ARMInstPrinter::getRegisterName(Reg);
return false;
}
@@ -391,7 +391,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
const MachineOperand &MO = MI->getOperand(RegOp);
if (!MO.isReg())
return true;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
O << ARMInstPrinter::getRegisterName(Reg);
return false;
}
@@ -400,12 +400,12 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
case 'f': { // The high doubleword register of a NEON quad register.
if (!MI->getOperand(OpNum).isReg())
return true;
- unsigned Reg = MI->getOperand(OpNum).getReg();
+ Register Reg = MI->getOperand(OpNum).getReg();
if (!ARM::QPRRegClass.contains(Reg))
return true;
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ?
- ARM::dsub_0 : ARM::dsub_1);
+ Register SubReg =
+ TRI->getSubReg(Reg, ExtraCode[0] == 'e' ? ARM::dsub_0 : ARM::dsub_1);
O << ARMInstPrinter::getRegisterName(SubReg);
return false;
}
@@ -419,7 +419,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return true;
const MachineFunction &MF = *MI->getParent()->getParent();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if(!ARM::GPRPairRegClass.contains(Reg))
return false;
Reg = TRI->getSubReg(Reg, ARM::gsub_1);
@@ -526,7 +526,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
- EmitAlignment(2);
+ EmitAlignment(Align(4));
for (auto &Stub : Stubs)
emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
@@ -539,7 +539,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection());
- EmitAlignment(2);
+ EmitAlignment(Align(4));
for (auto &Stub : Stubs)
emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
@@ -940,7 +940,7 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
// Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
// ARM mode tables.
- EmitAlignment(2);
+ EmitAlignment(Align(4));
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
@@ -986,7 +986,7 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
// Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
// ARM mode tables.
- EmitAlignment(2);
+ EmitAlignment(Align(4));
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
@@ -1015,7 +1015,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
unsigned JTI = MO1.getIndex();
if (Subtarget->isThumb1Only())
- EmitAlignment(2);
+ EmitAlignment(Align(4));
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
OutStreamer->EmitLabel(JTISymbol);
@@ -1058,7 +1058,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
// Make sure the next instruction is 2-byte aligned.
- EmitAlignment(1);
+ EmitAlignment(Align(2));
}
void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
@@ -1072,7 +1072,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
MF.getSubtarget().getRegisterInfo();
const MachineRegisterInfo &MachineRegInfo = MF.getRegInfo();
- unsigned FramePtr = TargetRegInfo->getFrameRegister(MF);
+ Register FramePtr = TargetRegInfo->getFrameRegister(MF);
unsigned Opc = MI->getOpcode();
unsigned SrcReg, DstReg;
@@ -1136,7 +1136,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
}
// Check for registers that are remapped (for a Thumb1 prologue that
// saves high registers).
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(Reg))
Reg = RemappedReg;
RegList.push_back(Reg);
@@ -1326,7 +1326,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// So here we generate a bl to a small jump pad that does bx rN.
// The jump pads are emitted after the function body.
- unsigned TReg = MI->getOperand(0).getReg();
+ Register TReg = MI->getOperand(0).getReg();
MCSymbol *TRegSym = nullptr;
for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) {
if (TIP.first == TReg) {
@@ -1663,8 +1663,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case ARM::tTBH_JT: {
bool Is8Bit = MI->getOpcode() == ARM::tTBB_JT;
- unsigned Base = MI->getOperand(0).getReg();
- unsigned Idx = MI->getOperand(1).getReg();
+ Register Base = MI->getOperand(0).getReg();
+ Register Idx = MI->getOperand(1).getReg();
assert(MI->getOperand(1).isKill() && "We need the index register as scratch!");
// Multiply up idx if necessary.
@@ -1844,8 +1844,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// b LSJLJEH
// movs r0, #1
// LSJLJEH:
- unsigned SrcReg = MI->getOperand(0).getReg();
- unsigned ValReg = MI->getOperand(1).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
+ Register ValReg = MI->getOperand(1).getReg();
MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true);
OutStreamer->AddComment("eh_setjmp begin");
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
@@ -1910,8 +1910,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// mov r0, #0
// add pc, pc, #0
// mov r0, #1
- unsigned SrcReg = MI->getOperand(0).getReg();
- unsigned ValReg = MI->getOperand(1).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
+ Register ValReg = MI->getOperand(1).getReg();
OutStreamer->AddComment("eh_setjmp begin");
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri)
@@ -1967,8 +1967,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// ldr $scratch, [$src, #4]
// ldr r7, [$src]
// bx $scratch
- unsigned SrcReg = MI->getOperand(0).getReg();
- unsigned ScratchReg = MI->getOperand(1).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
+ Register ScratchReg = MI->getOperand(1).getReg();
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
.addReg(ARM::SP)
.addReg(SrcReg)
@@ -2027,8 +2027,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// ldr $scratch, [$src, #4]
// ldr r7, [$src]
// bx $scratch
- unsigned SrcReg = MI->getOperand(0).getReg();
- unsigned ScratchReg = MI->getOperand(1).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
+ Register ScratchReg = MI->getOperand(1).getReg();
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
.addReg(ScratchReg)
@@ -2095,7 +2095,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// ldr.w sp, [$src, #8]
// ldr.w pc, [$src, #4]
- unsigned SrcReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(0).getReg();
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
.addReg(ARM::R11)
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 222aa85856a2..684cd1def977 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -172,9 +172,9 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
const MachineOperand &WB = isLoad ? MI.getOperand(1) : MI.getOperand(0);
const MachineOperand &Base = MI.getOperand(2);
const MachineOperand &Offset = MI.getOperand(NumOps - 3);
- unsigned WBReg = WB.getReg();
- unsigned BaseReg = Base.getReg();
- unsigned OffReg = Offset.getReg();
+ Register WBReg = WB.getReg();
+ Register BaseReg = Base.getReg();
+ Register OffReg = Offset.getReg();
unsigned OffImm = MI.getOperand(NumOps - 2).getImm();
ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm();
switch (AddrMode) {
@@ -276,8 +276,8 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
if (LV) {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
- if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
- unsigned Reg = MO.getReg();
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) {
+ Register Reg = MO.getReg();
LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
if (MO.isDef()) {
@@ -966,8 +966,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
SmallSet<unsigned, 4> DstRegs;
#endif
for (unsigned i = 0; i != SubRegs; ++i) {
- unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
- unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
+ Register Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+ Register Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
assert(Dst && Src && "Bad sub-register");
#ifndef NDEBUG
assert(!DstRegs.count(Src) && "destructive vector copy");
@@ -1019,7 +1019,7 @@ ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
if (!SubIdx)
return MIB.addReg(Reg, State);
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
return MIB.addReg(Reg, State, SubIdx);
}
@@ -1133,7 +1133,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ Subtarget.hasNEON()) {
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo))
.addFrameIndex(FI)
.addImm(16)
@@ -1155,7 +1156,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ Subtarget.hasNEON()) {
// FIXME: It's possible to only store part of the QQ register if the
// spilled def has a sub-register index.
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64QPseudo))
@@ -1337,7 +1339,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
}
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
} else
llvm_unreachable("Unknown reg class!");
@@ -1368,7 +1370,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
.addFrameIndex(FI)
.addImm(16)
@@ -1382,7 +1385,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
}
} else
@@ -1390,7 +1393,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
.addFrameIndex(FI)
.addImm(16)
@@ -1405,7 +1409,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
}
} else
@@ -1425,7 +1429,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI);
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
} else
llvm_unreachable("Unknown reg class!");
@@ -1583,8 +1587,8 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// Look for a copy between even S-registers. That is where we keep floats
// when using NEON v2f32 instructions for f32 arithmetic.
- unsigned DstRegS = MI.getOperand(0).getReg();
- unsigned SrcRegS = MI.getOperand(1).getReg();
+ Register DstRegS = MI.getOperand(0).getReg();
+ Register SrcRegS = MI.getOperand(1).getReg();
if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
return false;
@@ -1794,12 +1798,11 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0,
if (MI0.getNumOperands() != MI1.getNumOperands())
return false;
- unsigned Addr0 = MI0.getOperand(1).getReg();
- unsigned Addr1 = MI1.getOperand(1).getReg();
+ Register Addr0 = MI0.getOperand(1).getReg();
+ Register Addr1 = MI1.getOperand(1).getReg();
if (Addr0 != Addr1) {
- if (!MRI ||
- !TargetRegisterInfo::isVirtualRegister(Addr0) ||
- !TargetRegisterInfo::isVirtualRegister(Addr1))
+ if (!MRI || !Register::isVirtualRegister(Addr0) ||
+ !Register::isVirtualRegister(Addr1))
return false;
// This assumes SSA form.
@@ -2076,6 +2079,38 @@ isProfitableToIfCvt(MachineBasicBlock &TBB,
return PredCost <= UnpredCost;
}
+unsigned
+ARMBaseInstrInfo::extraSizeToPredicateInstructions(const MachineFunction &MF,
+ unsigned NumInsts) const {
+ // Thumb2 needs a 2-byte IT instruction to predicate up to 4 instructions.
+ // ARM has a condition code field in every predicable instruction, using it
+ // doesn't change code size.
+ return Subtarget.isThumb2() ? divideCeil(NumInsts, 4) * 2 : 0;
+}
+
+unsigned
+ARMBaseInstrInfo::predictBranchSizeForIfCvt(MachineInstr &MI) const {
+ // If this branch is likely to be folded into the comparison to form a
+ // CB(N)Z, then removing it won't reduce code size at all, because that will
+ // just replace the CB(N)Z with a CMP.
+ if (MI.getOpcode() == ARM::t2Bcc &&
+ findCMPToFoldIntoCBZ(&MI, &getRegisterInfo()))
+ return 0;
+
+ unsigned Size = getInstSizeInBytes(MI);
+
+ // For Thumb2, all branches are 32-bit instructions during the if conversion
+ // pass, but may be replaced with 16-bit instructions during size reduction.
+ // Since the branches considered by if conversion tend to be forward branches
+ // over small basic blocks, they are very likely to be in range for the
+ // narrow instructions, so we assume the final code size will be half what it
+ // currently is.
+ if (Subtarget.isThumb2())
+ Size /= 2;
+
+ return Size;
+}
+
bool
ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB) const {
@@ -2141,7 +2176,7 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
MachineInstr *
ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) const {
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return nullptr;
if (!MRI.hasOneNonDBGUse(Reg))
return nullptr;
@@ -2163,7 +2198,7 @@ ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
// MI can't have any tied operands, that would conflict with predication.
if (MO.isTied())
return nullptr;
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ if (Register::isPhysicalRegister(MO.getReg()))
return nullptr;
if (MO.isDef() && !MO.isDead())
return nullptr;
@@ -2211,7 +2246,7 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI,
// Find new register class to use.
MachineOperand FalseReg = MI.getOperand(Invert ? 2 : 1);
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
if (!MRI.constrainRegClass(DestReg, PreviousClass))
return nullptr;
@@ -2298,6 +2333,7 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
{ARM::tSUBSrr, ARM::tSUBrr},
{ARM::tSBCS, ARM::tSBC},
{ARM::tRSBS, ARM::tRSB},
+ {ARM::tLSLSri, ARM::tLSLri},
{ARM::t2ADDSri, ARM::t2ADDri},
{ARM::t2ADDSrr, ARM::t2ADDrr},
@@ -2420,7 +2456,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
MachineOperand &MO = MI->getOperand(i);
RegList.push_back(MO);
- if (MO.isReg() && TRI->getEncodingValue(MO.getReg()) < FirstRegEnc)
+ if (MO.isReg() && !MO.isImplicit() &&
+ TRI->getEncodingValue(MO.getReg()) < FirstRegEnc)
FirstRegEnc = TRI->getEncodingValue(MO.getReg());
}
@@ -2430,7 +2467,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded;
--CurRegEnc) {
unsigned CurReg = RegClass->getRegister(CurRegEnc);
- if (IsT1PushPop && CurReg > ARM::R7)
+ if (IsT1PushPop && CurRegEnc > TRI->getEncodingValue(ARM::R7))
continue;
if (!IsPop) {
// Pushing any register is completely harmless, mark the register involved
@@ -3039,18 +3076,22 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
break;
case ARM::VSELEQD:
case ARM::VSELEQS:
+ case ARM::VSELEQH:
CC = ARMCC::EQ;
break;
case ARM::VSELGTD:
case ARM::VSELGTS:
+ case ARM::VSELGTH:
CC = ARMCC::GT;
break;
case ARM::VSELGED:
case ARM::VSELGES:
+ case ARM::VSELGEH:
CC = ARMCC::GE;
break;
- case ARM::VSELVSS:
case ARM::VSELVSD:
+ case ARM::VSELVSS:
+ case ARM::VSELVSH:
CC = ARMCC::VS;
break;
}
@@ -3271,9 +3312,9 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
}
unsigned OpIdx = Commute ? 2 : 1;
- unsigned Reg1 = UseMI.getOperand(OpIdx).getReg();
+ Register Reg1 = UseMI.getOperand(OpIdx).getReg();
bool isKill = UseMI.getOperand(OpIdx).isKill();
- unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+ Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(NewUseOpc),
NewReg)
.addReg(Reg1, getKillRegState(isKill))
@@ -3335,15 +3376,15 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRSB_POST:
case ARM::LDRSH_POST: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rm = MI.getOperand(3).getReg();
return (Rt == Rm) ? 4 : 3;
}
case ARM::LDR_PRE_REG:
case ARM::LDRB_PRE_REG: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (Rt == Rm)
return 3;
unsigned ShOpVal = MI.getOperand(4).getImm();
@@ -3372,8 +3413,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRH_PRE:
case ARM::STRH_PRE: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (!Rm)
return 2;
if (Rt == Rm)
@@ -3384,8 +3425,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDR_POST_REG:
case ARM::LDRB_POST_REG:
case ARM::LDRH_POST: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rm = MI.getOperand(3).getReg();
return (Rt == Rm) ? 3 : 2;
}
@@ -3404,10 +3445,10 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
case ARM::LDRSB_PRE:
case ARM::LDRSH_PRE: {
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (Rm == 0)
return 3;
- unsigned Rt = MI.getOperand(0).getReg();
+ Register Rt = MI.getOperand(0).getReg();
if (Rt == Rm)
return 4;
unsigned ShOpVal = MI.getOperand(4).getImm();
@@ -3422,9 +3463,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
}
case ARM::LDRD: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rn = MI.getOperand(2).getReg();
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rn = MI.getOperand(2).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (Rm)
return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
: 3;
@@ -3432,7 +3473,7 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
}
case ARM::STRD: {
- unsigned Rm = MI.getOperand(3).getReg();
+ Register Rm = MI.getOperand(3).getReg();
if (Rm)
return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
: 3;
@@ -3448,9 +3489,9 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
return 4;
case ARM::LDRD_PRE: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rn = MI.getOperand(3).getReg();
- unsigned Rm = MI.getOperand(4).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rn = MI.getOperand(3).getReg();
+ Register Rm = MI.getOperand(4).getReg();
if (Rm)
return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
: 4;
@@ -3458,13 +3499,13 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
}
case ARM::t2LDRD_PRE: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rn = MI.getOperand(3).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rn = MI.getOperand(3).getReg();
return (Rt == Rn) ? 4 : 3;
}
case ARM::STRD_PRE: {
- unsigned Rm = MI.getOperand(4).getReg();
+ Register Rm = MI.getOperand(4).getReg();
if (Rm)
return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
: 4;
@@ -3495,8 +3536,8 @@ static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
return 2;
case ARM::t2LDRDi8: {
- unsigned Rt = MI.getOperand(0).getReg();
- unsigned Rn = MI.getOperand(2).getReg();
+ Register Rt = MI.getOperand(0).getReg();
+ Register Rn = MI.getOperand(2).getReg();
return (Rt == Rn) ? 3 : 2;
}
@@ -3745,7 +3786,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
}
bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const {
- unsigned BaseReg = MI.getOperand(0).getReg();
+ Register BaseReg = MI.getOperand(0).getReg();
for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) {
const auto &Op = MI.getOperand(i);
if (Op.isReg() && Op.getReg() == BaseReg)
@@ -4219,7 +4260,7 @@ int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return -1;
const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
- unsigned Reg = DefMO.getReg();
+ Register Reg = DefMO.getReg();
const MachineInstr *ResolvedDefMI = &DefMI;
unsigned DefAdj = 0;
@@ -4328,10 +4369,10 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
}
const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
- const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
+ auto *DefMN = cast<MachineSDNode>(DefNode);
unsigned DefAlign = !DefMN->memoperands_empty()
? (*DefMN->memoperands_begin())->getAlignment() : 0;
- const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode);
+ auto *UseMN = cast<MachineSDNode>(UseNode);
unsigned UseAlign = !UseMN->memoperands_empty()
? (*UseMN->memoperands_begin())->getAlignment() : 0;
int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
@@ -4708,7 +4749,7 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
if (MI.getOperand(i).isImplicit() ||
!MI.getOperand(i).isReg())
continue;
- unsigned Reg = MI.getOperand(i).getReg();
+ Register Reg = MI.getOperand(i).getReg();
if (Reg < ARM::R0 || Reg > ARM::R7) {
if (!(MI.getOpcode() == ARM::tPUSH && Reg == ARM::LR) &&
!(MI.getOpcode() == ARM::tPOP_RET && Reg == ARM::PC)) {
@@ -4731,7 +4772,7 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- unsigned Reg = MI->getOperand(0).getReg();
+ Register Reg = MI->getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MI->memoperands_begin())->getValue());
MachineInstrBuilder MIB;
@@ -5104,7 +5145,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance(
const MachineOperand &MO = MI.getOperand(OpNum);
if (MO.readsReg())
return 0;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
int UseOp = -1;
switch (MI.getOpcode()) {
@@ -5134,7 +5175,7 @@ unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance(
return 0;
// We must be able to clobber the whole D-reg.
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
// Virtual register must be a def undef foo:ssub_0 operand.
if (!MO.getSubReg() || MI.readsVirtualRegister(Reg))
return 0;
@@ -5159,8 +5200,8 @@ void ARMBaseInstrInfo::breakPartialRegDependency(
assert(TRI && "Need TRI instance");
const MachineOperand &MO = MI.getOperand(OpNum);
- unsigned Reg = MO.getReg();
- assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ Register Reg = MO.getReg();
+ assert(Register::isPhysicalRegister(Reg) &&
"Can't break virtual register dependencies.");
unsigned DReg = Reg;
@@ -5337,7 +5378,7 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br,
// is not redefined between the cmp and the br.
if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri)
return nullptr;
- unsigned Reg = CmpMI->getOperand(0).getReg();
+ Register Reg = CmpMI->getOperand(0).getReg();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg);
if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0)
@@ -5349,3 +5390,50 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br,
return &*CmpMI;
}
+
+unsigned llvm::ConstantMaterializationCost(unsigned Val,
+ const ARMSubtarget *Subtarget,
+ bool ForCodesize) {
+ if (Subtarget->isThumb()) {
+ if (Val <= 255) // MOV
+ return ForCodesize ? 2 : 1;
+ if (Subtarget->hasV6T2Ops() && (Val <= 0xffff || // MOV
+ ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW
+ ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN
+ return ForCodesize ? 4 : 1;
+ if (Val <= 510) // MOV + ADDi8
+ return ForCodesize ? 4 : 2;
+ if (~Val <= 255) // MOV + MVN
+ return ForCodesize ? 4 : 2;
+ if (ARM_AM::isThumbImmShiftedVal(Val)) // MOV + LSL
+ return ForCodesize ? 4 : 2;
+ } else {
+ if (ARM_AM::getSOImmVal(Val) != -1) // MOV
+ return ForCodesize ? 4 : 1;
+ if (ARM_AM::getSOImmVal(~Val) != -1) // MVN
+ return ForCodesize ? 4 : 1;
+ if (Subtarget->hasV6T2Ops() && Val <= 0xffff) // MOVW
+ return ForCodesize ? 4 : 1;
+ if (ARM_AM::isSOImmTwoPartVal(Val)) // two instrs
+ return ForCodesize ? 8 : 2;
+ }
+ if (Subtarget->useMovt()) // MOVW + MOVT
+ return ForCodesize ? 8 : 2;
+ return ForCodesize ? 8 : 3; // Literal pool load
+}
+
+bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
+ const ARMSubtarget *Subtarget,
+ bool ForCodesize) {
+ // Check with ForCodesize
+ unsigned Cost1 = ConstantMaterializationCost(Val1, Subtarget, ForCodesize);
+ unsigned Cost2 = ConstantMaterializationCost(Val2, Subtarget, ForCodesize);
+ if (Cost1 < Cost2)
+ return true;
+ if (Cost1 > Cost2)
+ return false;
+
+ // If they are equal, try with !ForCodesize
+ return ConstantMaterializationCost(Val1, Subtarget, !ForCodesize) <
+ ConstantMaterializationCost(Val2, Subtarget, !ForCodesize);
+}
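
The two if-conversion hooks added above reduce to simple arithmetic: a 2-byte Thumb2 IT instruction covers up to four predicated instructions, and a 32-bit Thumb2 branch is assumed to shrink to 16 bits after size reduction. A standalone sketch of that arithmetic (not part of the patch; the names only mirror the hooks):

#include <cstdio>

// One 2-byte IT instruction predicates up to four Thumb2 instructions; ARM
// mode predication is free because every instruction has a condition field.
static unsigned extraSizeToPredicate(unsigned NumInsts, bool IsThumb2) {
  return IsThumb2 ? ((NumInsts + 3) / 4) * 2 : 0;
}

// Thumb2 branches are 32-bit at if-conversion time but are assumed to shrink
// to 16-bit later, so only half of their current size is credited.
static unsigned predictedBranchSize(unsigned SizeInBytes, bool IsThumb2) {
  return IsThumb2 ? SizeInBytes / 2 : SizeInBytes;
}

int main() {
  // Predicating 5 instructions needs two IT blocks -> 4 extra bytes, while a
  // removed 4-byte t2Bcc is only credited as 2 bytes.
  std::printf("%u %u\n", extraSizeToPredicate(5, true),
              predictedBranchSize(4, true));
}
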
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index c28983fcc15c..c232b6f0b45d 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -276,6 +276,10 @@ public:
return NumCycles == 1;
}
+ unsigned extraSizeToPredicateInstructions(const MachineFunction &MF,
+ unsigned NumInsts) const override;
+ unsigned predictBranchSizeForIfCvt(MachineInstr &MI) const override;
+
bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB) const override;
@@ -601,7 +605,8 @@ bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
- const ARMBaseInstrInfo &TII);
+ const ARMBaseInstrInfo &TII,
+ const TargetRegisterInfo *TRI);
/// Return true if Reg is defd between From and To
bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From,
@@ -620,6 +625,20 @@ void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond);
void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond,
unsigned Inactive);
+/// Returns the number of instructions required to materialize the given
+/// constant in a register, or 3 if a literal pool load is needed.
+/// If ForCodesize is specified, an approximate cost in bytes is returned.
+unsigned ConstantMaterializationCost(unsigned Val,
+ const ARMSubtarget *Subtarget,
+ bool ForCodesize = false);
+
+/// Returns true if Val1 has a lower Constant Materialization Cost than Val2.
+/// Uses the cost from ConstantMaterializationCost, first with ForCodesize as
+/// specified. If the scores are equal, return the comparison for !ForCodesize.
+bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
+ const ARMSubtarget *Subtarget,
+ bool ForCodesize = false);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
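
ConstantMaterializationCost, declared here and defined earlier in ARMBaseInstrInfo.cpp, returns an instruction count by default and a byte count when ForCodesize is set. A reduced standalone sketch of the Thumb2 byte-cost ladder (illustration only; the real function also covers T2 modified immediates, the two-instruction MOV+ADD/MVN/LSL cases and the literal-pool fallback):

#include <cstdint>
#include <cstdio>

// Reduced byte-cost ladder for a Thumb2 core with MOVW/MOVT available.
static unsigned thumb2ByteCost(uint32_t Val) {
  if (Val <= 255)    return 2; // MOVS imm8
  if (Val <= 0xffff) return 4; // MOVW
  return 8;                    // MOVW + MOVT
}

int main() {
  // 255 costs 2 bytes and 0x12345678 costs 8, so a comparison in the spirit
  // of HasLowerConstantMaterializationCost(255, 0x12345678, ST, true)
  // favours 255.
  std::printf("%u vs %u\n", thumb2ByteCost(255), thumb2ByteCost(0x12345678u));
}
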
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index dc99b37742da..1eaf871867e0 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -174,6 +174,12 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
: CSR_AAPCS_ThisReturn_RegMask;
}
+ArrayRef<MCPhysReg> ARMBaseRegisterInfo::getIntraCallClobberedRegs(
+ const MachineFunction *MF) const {
+ static const MCPhysReg IntraCallClobberedRegs[] = {ARM::R12};
+ return ArrayRef<MCPhysReg>(IntraCallClobberedRegs);
+}
+
BitVector ARMBaseRegisterInfo::
getReservedRegs(const MachineFunction &MF) const {
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
@@ -185,7 +191,7 @@ getReservedRegs(const MachineFunction &MF) const {
markSuperRegs(Reserved, ARM::PC);
markSuperRegs(Reserved, ARM::FPSCR);
markSuperRegs(Reserved, ARM::APSR_NZCV);
- if (TFI->hasFP(MF))
+ if (TFI->hasFP(MF) || STI.isTargetDarwin())
markSuperRegs(Reserved, getFramePointerReg(STI));
if (hasBasePointer(MF))
markSuperRegs(Reserved, BasePtr);
@@ -217,7 +223,7 @@ isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const {
const TargetRegisterClass *
ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
- const MachineFunction &) const {
+ const MachineFunction &MF) const {
const TargetRegisterClass *Super = RC;
TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
do {
@@ -225,11 +231,13 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
case ARM::GPRRegClassID:
case ARM::SPRRegClassID:
case ARM::DPRRegClassID:
+ case ARM::GPRPairRegClassID:
+ return Super;
case ARM::QPRRegClassID:
case ARM::QQPRRegClassID:
case ARM::QQQQPRRegClassID:
- case ARM::GPRPairRegClassID:
- return Super;
+ if (MF.getSubtarget<ARMSubtarget>().hasNEON())
+ return Super;
}
Super = *I++;
} while (Super);
@@ -317,7 +325,7 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
return false;
unsigned PairedPhys = 0;
- if (TargetRegisterInfo::isPhysicalRegister(Paired)) {
+ if (Register::isPhysicalRegister(Paired)) {
PairedPhys = Paired;
} else if (VRM && VRM->hasPhys(Paired)) {
PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this);
@@ -347,7 +355,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
Hint.first == (unsigned)ARMRI::RegPairEven) &&
- TargetRegisterInfo::isVirtualRegister(Hint.second)) {
+ Register::isVirtualRegister(Hint.second)) {
// If 'Reg' is one of the even / odd register pair and it's now changed
// (e.g. coalesced) into a different register. The other register of the
// pair allocation hint must be updated to reflect the relationship
@@ -357,7 +365,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
// Make sure the pair has not already divorced.
if (Hint.second == Reg) {
MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
- if (TargetRegisterInfo::isVirtualRegister(NewReg))
+ if (Register::isVirtualRegister(NewReg))
MRI->setRegAllocationHint(NewReg,
Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven
: ARMRI::RegPairOdd, OtherReg);
@@ -663,7 +671,7 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII);
else {
assert(AFI->isThumb2Function());
- Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
+ Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII, this);
}
assert(Done && "Unable to resolve frame index!");
(void)Done;
@@ -775,7 +783,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
else {
assert(AFI->isThumb2Function());
- Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+ Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII, this);
}
if (Done)
return;
@@ -783,21 +791,32 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above, handle the rest, providing a register that is
// SP+LargeImm.
- assert((Offset ||
- (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
- (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) &&
- "This code isn't needed if offset already handled!");
+ assert(
+ (Offset ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6 ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7 ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrModeT2_i7s2 ||
+ (MI.getDesc().TSFlags & ARMII::AddrModeMask) ==
+ ARMII::AddrModeT2_i7s4) &&
+ "This code isn't needed if offset already handled!");
unsigned ScratchReg = 0;
int PIdx = MI.findFirstPredOperandIdx();
ARMCC::CondCodes Pred = (PIdx == -1)
? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg();
- if (Offset == 0)
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ const TargetRegisterClass *RegClass =
+ TII.getRegClass(MCID, FIOperandNum, this, *MI.getParent()->getParent());
+
+ if (Offset == 0 &&
+ (Register::isVirtualRegister(FrameReg) || RegClass->contains(FrameReg)))
// Must be addrmode4/6.
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false);
else {
- ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass);
+ ScratchReg = MF.getRegInfo().createVirtualRegister(RegClass);
if (!AFI->isThumbFunction())
emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
Offset, Pred, PredReg, TII);
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 7e2c72b4d712..477f3ad0a9a7 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -129,6 +129,9 @@ public:
const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
CallingConv::ID) const;
+ ArrayRef<MCPhysReg>
+ getIntraCallClobberedRegs(const MachineFunction *MF) const override;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
unsigned PhysReg) const override;
@@ -176,8 +179,6 @@ public:
Register getFrameRegister(const MachineFunction &MF) const override;
unsigned getBaseRegister() const { return BasePtr; }
- bool isLowRegister(unsigned Reg) const;
-
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
diff --git a/lib/Target/ARM/ARMBasicBlockInfo.cpp b/lib/Target/ARM/ARMBasicBlockInfo.cpp
index 2de90e816b33..00a2231f59e3 100644
--- a/lib/Target/ARM/ARMBasicBlockInfo.cpp
+++ b/lib/Target/ARM/ARMBasicBlockInfo.cpp
@@ -6,14 +6,16 @@
//
//===----------------------------------------------------------------------===//
+#include "ARMBasicBlockInfo.h"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
-#include "ARMBasicBlockInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Support/Debug.h"
#include <vector>
#define DEBUG_TYPE "arm-bb-utils"
@@ -47,7 +49,7 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) {
BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
BBI.Size = 0;
BBI.Unalign = 0;
- BBI.PostAlign = 0;
+ BBI.PostAlign = Align::None();
for (MachineInstr &I : *MBB) {
BBI.Size += TII->getInstSizeInBytes(I);
@@ -62,8 +64,8 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) {
// tBR_JTr contains a .align 2 directive.
if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) {
- BBI.PostAlign = 2;
- MBB->getParent()->ensureAlignment(2);
+ BBI.PostAlign = Align(4);
+ MBB->getParent()->ensureAlignment(Align(4));
}
}
@@ -126,9 +128,9 @@ void ARMBasicBlockUtils::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
for(unsigned i = BBNum + 1, e = MF.getNumBlockIDs(); i < e; ++i) {
// Get the offset and known bits at the end of the layout predecessor.
// Include the alignment of the current block.
- unsigned LogAlign = MF.getBlockNumbered(i)->getAlignment();
- unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
- unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
+ const Align Align = MF.getBlockNumbered(i)->getAlignment();
+ const unsigned Offset = BBInfo[i - 1].postOffset(Align);
+ const unsigned KnownBits = BBInfo[i - 1].postKnownBits(Align);
// This is where block i begins. Stop if the offset is already correct,
// and we have updated 2 blocks. This is the maximum number of blocks
diff --git a/lib/Target/ARM/ARMBasicBlockInfo.h b/lib/Target/ARM/ARMBasicBlockInfo.h
index 400bba351cec..13df399ed995 100644
--- a/lib/Target/ARM/ARMBasicBlockInfo.h
+++ b/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -21,17 +21,18 @@
namespace llvm {
+struct BasicBlockInfo;
using BBInfoVector = SmallVectorImpl<BasicBlockInfo>;
/// UnknownPadding - Return the worst case padding that could result from
/// unknown offset bits. This does not include alignment padding caused by
/// known offset bits.
///
-/// @param LogAlign log2(alignment)
+/// @param Alignment alignment
/// @param KnownBits Number of known low offset bits.
-inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
- if (KnownBits < LogAlign)
- return (1u << LogAlign) - (1u << KnownBits);
+inline unsigned UnknownPadding(Align Alignment, unsigned KnownBits) {
+ if (KnownBits < Log2(Alignment))
+ return Alignment.value() - (1ull << KnownBits);
return 0;
}
@@ -65,10 +66,9 @@ struct BasicBlockInfo {
/// multiple of 1 << Unalign.
uint8_t Unalign = 0;
- /// PostAlign - When non-zero, the block terminator contains a .align
- /// directive, so the end of the block is aligned to 1 << PostAlign
- /// bytes.
- uint8_t PostAlign = 0;
+ /// PostAlign - When > 1, the block terminator contains a .align
+ /// directive, so the end of the block is aligned to PostAlign bytes.
+ Align PostAlign;
BasicBlockInfo() = default;
@@ -84,16 +84,16 @@ struct BasicBlockInfo {
return Bits;
}
- /// Compute the offset immediately following this block. If LogAlign is
+ /// Compute the offset immediately following this block. If Align is
/// specified, return the offset the successor block will get if it has
/// this alignment.
- unsigned postOffset(unsigned LogAlign = 0) const {
+ unsigned postOffset(Align Alignment = Align::None()) const {
unsigned PO = Offset + Size;
- unsigned LA = std::max(unsigned(PostAlign), LogAlign);
- if (!LA)
+ const Align PA = std::max(PostAlign, Alignment);
+ if (PA == Align::None())
return PO;
// Add alignment padding from the terminator.
- return PO + UnknownPadding(LA, internalKnownBits());
+ return PO + UnknownPadding(PA, internalKnownBits());
}
/// Compute the number of known low bits of postOffset. If this block
@@ -101,9 +101,8 @@ struct BasicBlockInfo {
/// instruction alignment. An aligned terminator may increase the number
/// of known bits.
/// If LogAlign is given, also consider the alignment of the next block.
- unsigned postKnownBits(unsigned LogAlign = 0) const {
- return std::max(std::max(unsigned(PostAlign), LogAlign),
- internalKnownBits());
+ unsigned postKnownBits(Align Align = Align::None()) const {
+ return std::max(Log2(std::max(PostAlign, Align)), internalKnownBits());
}
};
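
The hunks above replace the old log2-encoded alignments with byte-based llvm::Align values (e.g. PostAlign = 2 becomes Align(4)), so UnknownPadding and postOffset now work directly in bytes. A minimal standalone sketch of that padding arithmetic, using plain integers rather than llvm::Align (values are illustrative):

#include <cassert>
#include <cstdint>

// Worst-case padding when only the low KnownBits of an offset are known and
// the following data wants AlignBytes (a power of two) of alignment.
uint64_t worstCasePadding(uint64_t AlignBytes, unsigned KnownBits) {
  if ((1ull << KnownBits) < AlignBytes)
    return AlignBytes - (1ull << KnownBits);
  return 0;
}

int main() {
  // A 4-byte-aligned constant island after a block whose end offset is only
  // known to be even may need up to 4 - 2 = 2 bytes of padding.
  assert(worstCasePadding(4, 1) == 2);
  // If the offset is already known to be 4-byte aligned, no padding is needed.
  assert(worstCasePadding(4, 2) == 0);
  return 0;
}
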
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 0cbe6e1871e4..d3b595ce8323 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -90,6 +90,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
: ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ bool isIncomingArgumentHandler() const override { return false; }
+
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
@@ -169,8 +171,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info, CCState &State) override {
- if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State))
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State) override {
+ if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State))
return true;
StackSize =
@@ -199,9 +202,8 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
if (SplitVTs.size() == 1) {
// Even if there is no splitting to do, we still want to replace the
// original type (e.g. pointer type -> integer).
- auto Flags = OrigArg.Flags;
- unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty);
- Flags.setOrigAlign(OriginalAlignment);
+ auto Flags = OrigArg.Flags[0];
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(OrigArg.Ty)));
SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
Flags, OrigArg.IsFixed);
return;
@@ -211,10 +213,9 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) {
EVT SplitVT = SplitVTs[i];
Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
- auto Flags = OrigArg.Flags;
+ auto Flags = OrigArg.Flags[0];
- unsigned OriginalAlignment = DL.getABITypeAlignment(SplitTy);
- Flags.setOrigAlign(OriginalAlignment);
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(SplitTy)));
bool NeedsConsecutiveRegisters =
TLI.functionArgumentNeedsConsecutiveRegisters(
@@ -286,7 +287,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
CCAssignFn AssignFn)
: ValueHandler(MIRBuilder, MRI, AssignFn) {}
- bool isArgumentHandler() const override { return true; }
+ bool isIncomingArgumentHandler() const override { return true; }
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -298,7 +299,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- unsigned AddrReg =
+ Register AddrReg =
MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32));
MIRBuilder.buildFrameIndex(AddrReg, FI);
@@ -405,6 +406,7 @@ struct FormalArgHandler : public IncomingValueHandler {
: IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
};
@@ -498,11 +500,7 @@ unsigned getCallOpcode(const ARMSubtarget &STI, bool isDirect) {
}
} // end anonymous namespace
-bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
- CallingConv::ID CallConv,
- const MachineOperand &Callee,
- const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const {
+bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const {
MachineFunction &MF = MIRBuilder.getMF();
const auto &TLI = *getTLI<ARMTargetLowering>();
const auto &DL = MF.getDataLayout();
@@ -520,7 +518,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create the call instruction so we can add the implicit uses of arg
// registers, but don't insert it yet.
- bool IsDirect = !Callee.isReg();
+ bool IsDirect = !Info.Callee.isReg();
auto CallOpcode = getCallOpcode(STI, IsDirect);
auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode);
@@ -528,35 +526,35 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (IsThumb)
MIB.add(predOps(ARMCC::AL));
- MIB.add(Callee);
+ MIB.add(Info.Callee);
if (!IsDirect) {
- auto CalleeReg = Callee.getReg();
- if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg)) {
+ auto CalleeReg = Info.Callee.getReg();
+ if (CalleeReg && !Register::isPhysicalRegister(CalleeReg)) {
unsigned CalleeIdx = IsThumb ? 2 : 0;
MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(),
- *MIB.getInstr(), MIB->getDesc(), Callee, CalleeIdx));
+ *MIB.getInstr(), MIB->getDesc(), Info.Callee, CalleeIdx));
}
}
- MIB.addRegMask(TRI->getCallPreservedMask(MF, CallConv));
+ MIB.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv));
bool IsVarArg = false;
SmallVector<ArgInfo, 8> ArgInfos;
- for (auto Arg : OrigArgs) {
+ for (auto Arg : Info.OrigArgs) {
if (!isSupportedType(DL, TLI, Arg.Ty))
return false;
if (!Arg.IsFixed)
IsVarArg = true;
- if (Arg.Flags.isByVal())
+ if (Arg.Flags[0].isByVal())
return false;
splitToValueTypes(Arg, ArgInfos, MF);
}
- auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, IsVarArg);
+ auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, IsVarArg);
OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn);
if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
return false;
@@ -564,13 +562,13 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Now we can add the actual call instruction to the correct basic block.
MIRBuilder.insertInstr(MIB);
- if (!OrigRet.Ty->isVoidTy()) {
- if (!isSupportedType(DL, TLI, OrigRet.Ty))
+ if (!Info.OrigRet.Ty->isVoidTy()) {
+ if (!isSupportedType(DL, TLI, Info.OrigRet.Ty))
return false;
ArgInfos.clear();
- splitToValueTypes(OrigRet, ArgInfos, MF);
- auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, IsVarArg);
+ splitToValueTypes(Info.OrigRet, ArgInfos, MF);
+ auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, IsVarArg);
CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn);
if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
return false;
diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h
index 794127b5ebc7..ddbc9feb90e2 100644
--- a/lib/Target/ARM/ARMCallLowering.h
+++ b/lib/Target/ARM/ARMCallLowering.h
@@ -38,9 +38,8 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
- const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
private:
bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
diff --git a/lib/Target/ARM/ARMCallingConv.cpp b/lib/Target/ARM/ARMCallingConv.cpp
index 5ede7c67f7c2..92ebc542b423 100644
--- a/lib/Target/ARM/ARMCallingConv.cpp
+++ b/lib/Target/ARM/ARMCallingConv.cpp
@@ -193,7 +193,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
// Try to allocate a contiguous block of registers, each of the correct
// size to hold one member.
auto &DL = State.getMachineFunction().getDataLayout();
- unsigned StackAlign = DL.getStackAlignment();
+ unsigned StackAlign = DL.getStackAlignment().value();
unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign);
ArrayRef<MCPhysReg> RegList;
diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
index 2fc5f4aaab50..1c2c8aef55bb 100644
--- a/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -179,16 +179,12 @@ public:
}
static bool GenerateSignBits(Value *V) {
- if (auto *Arg = dyn_cast<Argument>(V))
- return Arg->hasSExtAttr();
-
if (!isa<Instruction>(V))
return false;
unsigned Opc = cast<Instruction>(V)->getOpcode();
return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
- Opc == Instruction::SRem || Opc == Instruction::SExt ||
- Opc == Instruction::SIToFP;
+ Opc == Instruction::SRem || Opc == Instruction::SExt;
}
static bool EqualTypeSize(Value *V) {
@@ -806,54 +802,48 @@ void IRPromoter::Mutate(Type *OrigTy,
/// return value is zeroext. We don't allow opcodes that can introduce sign
/// bits.
bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
- if (auto *I = dyn_cast<ICmpInst>(V)) {
- // Now that we allow small types than TypeSize, only allow icmp of
- // TypeSize because they will require a trunc to be legalised.
- // TODO: Allow icmp of smaller types, and calculate at the end
- // whether the transform would be beneficial.
- if (isa<PointerType>(I->getOperand(0)->getType()))
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ switch (I->getOpcode()) {
+ default:
+ return isa<BinaryOperator>(I) && isSupportedType(I) &&
+ !GenerateSignBits(I);
+ case Instruction::GetElementPtr:
+ case Instruction::Store:
+ case Instruction::Br:
+ case Instruction::Switch:
return true;
- return EqualTypeSize(I->getOperand(0));
- }
-
- if (GenerateSignBits(V)) {
- LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n");
- return false;
- }
-
- // Memory instructions
- if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
- return true;
-
- // Branches and targets.
- if( isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
- return true;
-
- // Non-instruction values that we can handle.
- if ((isa<Constant>(V) && !isa<ConstantExpr>(V)) || isa<Argument>(V))
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ case Instruction::Trunc:
+ case Instruction::BitCast:
+ return isSupportedType(I);
+ case Instruction::ZExt:
+ return isSupportedType(I->getOperand(0));
+ case Instruction::ICmp:
+ // Now that we allow types smaller than TypeSize, only allow icmps of
+ // TypeSize, because smaller ones would require a trunc to be legalised.
+ // TODO: Allow icmp of smaller types, and calculate at the end
+ // whether the transform would be beneficial.
+ if (isa<PointerType>(I->getOperand(0)->getType()))
+ return true;
+ return EqualTypeSize(I->getOperand(0));
+ case Instruction::Call: {
+ // Special cases for calls as we need to check for zeroext
+ // TODO We should accept calls even if they don't have zeroext, as they
+ // can still be sinks.
+ auto *Call = cast<CallInst>(I);
+ return isSupportedType(Call) &&
+ Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ }
+ }
+ } else if (isa<Constant>(V) && !isa<ConstantExpr>(V)) {
return isSupportedType(V);
-
- if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) ||
- isa<LoadInst>(V))
+ } else if (isa<Argument>(V))
return isSupportedType(V);
- if (auto *Cast = dyn_cast<CastInst>(V))
- return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0));
-
- // Special cases for calls as we need to check for zeroext
- // TODO We should accept calls even if they don't have zeroext, as they can
- // still be sinks.
- if (auto *Call = dyn_cast<CallInst>(V))
- return isSupportedType(Call) &&
- Call->hasRetAttr(Attribute::AttrKind::ZExt);
-
- if (!isa<BinaryOperator>(V))
- return false;
-
- if (!isSupportedType(V))
- return false;
-
- return true;
+ return isa<BasicBlock>(V);
}
/// Check that the type of V would be promoted and that the original type is
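
The rewritten isSupportedValue keeps the pass's invariant that values are promoted by zero-extension, so operations whose result depends on the sign bit (ashr, sdiv, srem, sext) stay rejected via GenerateSignBits. A small standalone illustration of why zero-extension breaks such operations, assuming an i8 value widened to i32:

#include <cassert>
#include <cstdint>

int main() {
  int8_t  Narrow   = -128;              // 0x80
  int32_t Promoted = uint8_t(Narrow);   // zero-extended: 0x00000080
  // An arithmetic shift of the narrow value replicates the sign bit...
  assert((Narrow >> 1) == -64);         // i8 result 0xC0
  // ...but the same shift after zero-extension does not.
  assert((Promoted >> 1) == 64);        // i32 result 0x40
  return 0;
}
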
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 60e5d7bf6098..24ca25f73e96 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -26,8 +26,10 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -69,6 +71,7 @@ STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed");
STATISTIC(NumJTMoved, "Number of jump table destination blocks moved");
STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
+STATISTIC(NumLEInserted, "Number of LE backwards branches inserted");
static cl::opt<bool>
AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
@@ -212,6 +215,7 @@ namespace {
const ARMBaseInstrInfo *TII;
const ARMSubtarget *STI;
ARMFunctionInfo *AFI;
+ MachineDominatorTree *DT = nullptr;
bool isThumb;
bool isThumb1;
bool isThumb2;
@@ -224,6 +228,11 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
@@ -238,7 +247,7 @@ namespace {
void doInitialJumpTablePlacement(std::vector<MachineInstr *> &CPEMIs);
bool BBHasFallthrough(MachineBasicBlock *MBB);
CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
- unsigned getCPELogAlign(const MachineInstr *CPEMI);
+ Align getCPEAlign(const MachineInstr *CPEMI);
void scanFunctionJumpTables();
void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
@@ -327,8 +336,7 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
const BasicBlockInfo &BBI = BBInfo[J];
dbgs() << format("%08x %bb.%u\t", BBI.Offset, J)
<< " kb=" << unsigned(BBI.KnownBits)
- << " ua=" << unsigned(BBI.Unalign)
- << " pa=" << unsigned(BBI.PostAlign)
+ << " ua=" << unsigned(BBI.Unalign) << " pa=" << Log2(BBI.PostAlign)
<< format(" size=%#x\n", BBInfo[J].Size);
}
});
@@ -349,6 +357,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
isPositionIndependentOrROPI =
STI->getTargetLowering()->isPositionIndependent() || STI->isROPI();
AFI = MF->getInfo<ARMFunctionInfo>();
+ DT = &getAnalysis<MachineDominatorTree>();
isThumb = AFI->isThumbFunction();
isThumb1 = AFI->isThumb1OnlyFunction();
@@ -357,9 +366,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
HasFarJump = false;
bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB);
- // This pass invalidates liveness information when it splits basic blocks.
- MF->getRegInfo().invalidateLiveness();
-
// Renumber all of the machine basic blocks in the function, guaranteeing that
// the numbers agree with the position of the block in the function.
MF->RenumberBlocks();
@@ -398,7 +404,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// Functions with jump tables need an alignment of 4 because they use the ADR
// instruction, which aligns the PC to 4 bytes before adding an offset.
if (!T2JumpTables.empty())
- MF->ensureAlignment(2);
+ MF->ensureAlignment(Align(4));
/// Remove dead constant pool entries.
MadeChange |= removeUnusedCPEntries();
@@ -487,8 +493,9 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
MF->push_back(BB);
- // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
- unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+ // MachineConstantPool measures alignment in bytes.
+ const Align MaxAlign(MCP->getConstantPoolAlignment());
+ const unsigned MaxLogAlign = Log2(MaxAlign);
// Mark the basic block as required by the const-pool.
BB->setAlignment(MaxAlign);
@@ -501,7 +508,8 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
// alignment of all entries as long as BB is sufficiently aligned. Keep
// track of the insertion point for each alignment. We are going to bucket
// sort the entries as they are created.
- SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+ SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxLogAlign + 1,
+ BB->end());
// Add all of the constants from the constant pool to the end block, use an
// identity mapping of CPI's to CPE's.
@@ -526,7 +534,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
// Ensure that future entries with higher alignment get inserted before
// CPEMI. This is bucket sort with iterators.
- for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+ for (unsigned a = LogAlign + 1; a <= MaxLogAlign; ++a)
if (InsPoint[a] == InsAt)
InsPoint[a] = CPEMI;
@@ -640,29 +648,27 @@ ARMConstantIslands::findConstPoolEntry(unsigned CPI,
return nullptr;
}
-/// getCPELogAlign - Returns the required alignment of the constant pool entry
-/// represented by CPEMI. Alignment is measured in log2(bytes) units.
-unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+/// getCPEAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.
+Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) {
switch (CPEMI->getOpcode()) {
case ARM::CONSTPOOL_ENTRY:
break;
case ARM::JUMPTABLE_TBB:
- return isThumb1 ? 2 : 0;
+ return isThumb1 ? Align(4) : Align(1);
case ARM::JUMPTABLE_TBH:
- return isThumb1 ? 2 : 1;
+ return isThumb1 ? Align(4) : Align(2);
case ARM::JUMPTABLE_INSTS:
- return 1;
+ return Align(2);
case ARM::JUMPTABLE_ADDRS:
- return 2;
+ return Align(4);
default:
llvm_unreachable("unknown constpool entry kind");
}
unsigned CPI = getCombinedIndex(CPEMI);
assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
- unsigned Align = MCP->getConstants()[CPI].getAlignment();
- assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
- return Log2_32(Align);
+ return Align(MCP->getConstants()[CPI].getAlignment());
}
/// scanFunctionJumpTables - Do a scan of the function, building up
@@ -687,7 +693,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
BBInfoVector &BBInfo = BBUtils->getBBInfo();
// The known bits of the entry block offset are determined by the function
// alignment.
- BBInfo.front().KnownBits = MF->getAlignment();
+ BBInfo.front().KnownBits = Log2(MF->getAlignment());
// Compute block offsets and known bits.
BBUtils->adjustBBOffsetsAfter(&MF->front());
@@ -824,11 +830,6 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
Scale = 2; // +-(offset_8*2)
NegOk = true;
break;
-
- case ARM::tLDRHi:
- Bits = 5;
- Scale = 2; // +(offset_5*2)
- break;
}
// Remember that this is a user of a CP entry.
@@ -885,6 +886,13 @@ void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
MachineBasicBlock *OrigBB = MI->getParent();
+ // Collect liveness information at MI.
+ LivePhysRegs LRs(*MF->getSubtarget().getRegisterInfo());
+ LRs.addLiveOuts(*OrigBB);
+ auto LivenessEnd = ++MachineBasicBlock::iterator(MI).getReverse();
+ for (MachineInstr &LiveMI : make_range(OrigBB->rbegin(), LivenessEnd))
+ LRs.stepBackward(LiveMI);
+
// Create a new MBB for the code after the OrigBB.
MachineBasicBlock *NewBB =
MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
@@ -913,6 +921,12 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
// OrigBB branches to NewBB.
OrigBB->addSuccessor(NewBB);
+ // Update live-in information in the new block.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ for (MCPhysReg L : LRs)
+ if (!MRI.isReserved(L))
+ NewBB->addLiveIn(L);
+
// Update internal data structures to account for the newly inserted MBB.
// This is almost the same as updateForInsertedWaterBlock, except that
// the Water goes after OrigBB, not NewBB.
@@ -1007,13 +1021,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
MachineBasicBlock* Water, CPUser &U,
unsigned &Growth) {
BBInfoVector &BBInfo = BBUtils->getBBInfo();
- unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
- unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
- unsigned NextBlockOffset, NextBlockAlignment;
+ const Align CPEAlign = getCPEAlign(U.CPEMI);
+ const unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPEAlign);
+ unsigned NextBlockOffset;
+ Align NextBlockAlignment;
MachineFunction::const_iterator NextBlock = Water->getIterator();
if (++NextBlock == MF->end()) {
NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
- NextBlockAlignment = 0;
} else {
NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
NextBlockAlignment = NextBlock->getAlignment();
@@ -1028,13 +1042,13 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
Growth = CPEEnd - NextBlockOffset;
// Compute the padding that would go at the end of the CPE to align the next
// block.
- Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
+ Growth += offsetToAlignment(CPEEnd, NextBlockAlignment);
// If the CPE is to be inserted before the instruction, that will raise
// the offset of the instruction. Also account for unknown alignment padding
// in blocks between CPE and the user.
if (CPEOffset < UserOffset)
- UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign);
+ UserOffset += Growth + UnknownPadding(MF->getAlignment(), Log2(CPEAlign));
} else
// CPE fits in existing padding.
Growth = 0;
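
The Growth computation above now pads the end of the CPE up to the next block's alignment via offsetToAlignment instead of the old log2-based OffsetToAlignment. A minimal sketch of that helper's behaviour with plain integers (illustrative; the real function takes an llvm::Align):

#include <cassert>
#include <cstdint>

// Bytes that must be added to Value to reach the next multiple of N
// (0 if Value is already a multiple of N).
uint64_t offsetToAlignmentSketch(uint64_t Value, uint64_t N) {
  return (N - (Value % N)) % N;
}

int main() {
  assert(offsetToAlignmentSketch(10, 8) == 6); // pad a CPE ending at 10 to 16
  assert(offsetToAlignmentSketch(16, 8) == 0); // already aligned
  return 0;
}
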
@@ -1200,8 +1214,8 @@ bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
// inserting islands between BB0 and BB1 makes other accesses out of range.
MachineBasicBlock *UserBB = U.MI->getParent();
BBInfoVector &BBInfo = BBUtils->getBBInfo();
- unsigned MinNoSplitDisp =
- BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI));
+ const Align CPEAlign = getCPEAlign(U.CPEMI);
+ unsigned MinNoSplitDisp = BBInfo[UserBB->getNumber()].postOffset(CPEAlign);
if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2)
return false;
for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
@@ -1254,7 +1268,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
MachineInstr *CPEMI = U.CPEMI;
- unsigned CPELogAlign = getCPELogAlign(CPEMI);
+ const Align CPEAlign = getCPEAlign(CPEMI);
MachineBasicBlock *UserMBB = UserMI->getParent();
BBInfoVector &BBInfo = BBUtils->getBBInfo();
const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
@@ -1267,7 +1281,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Size of branch to insert.
unsigned Delta = isThumb1 ? 2 : 4;
// Compute the offset where the CPE will begin.
- unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+ unsigned CPEOffset = UserBBI.postOffset(CPEAlign) + Delta;
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
@@ -1308,11 +1322,11 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Try to split the block so it's fully aligned. Compute the latest split
// point where we can add a 4-byte branch instruction, and then align to
- // LogAlign which is the largest possible alignment in the function.
- unsigned LogAlign = MF->getAlignment();
- assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+ // Align which is the largest possible alignment in the function.
+ const Align Align = MF->getAlignment();
+ assert(Align >= CPEAlign && "Over-aligned constant pool entry");
unsigned KnownBits = UserBBI.internalKnownBits();
- unsigned UPad = UnknownPadding(LogAlign, KnownBits);
+ unsigned UPad = UnknownPadding(Align, KnownBits);
unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
BaseInsertOffset));
@@ -1323,7 +1337,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
BaseInsertOffset -= 4;
LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
- << " la=" << LogAlign << " kb=" << KnownBits
+ << " la=" << Log2(Align) << " kb=" << KnownBits
<< " up=" << UPad << '\n');
// This could point off the end of the block if we've already got constant
@@ -1337,6 +1351,28 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
BaseInsertOffset =
std::max(UserBBI.postOffset() - UPad - 8,
UserOffset + TII->getInstSizeInBytes(*UserMI) + 1);
+ // If the CP user (i.e. UserOffset) is within the first four instructions
+ // after the IT, this recalculated BaseInsertOffset could be in the middle
+ // of an IT block. If it is, change the BaseInsertOffset to just after the
+ // IT block. This still keeps the CP entry in range because of the
+ // following reasons.
+ // 1. The initial BaseInsertOffset calculated is (UserOffset +
+ // U.getMaxDisp() - UPad).
+ // 2. An IT block is at most 4 instructions plus the "it" itself (18
+ // bytes).
+ // 3. All the relevant instructions support a much larger maximum
+ // displacement.
+ MachineBasicBlock::iterator I = UserMI;
+ ++I;
+ for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI),
+ PredReg = 0;
+ I->getOpcode() != ARM::t2IT &&
+ getITInstrPredicate(*I, PredReg) != ARMCC::AL;
+ Offset += TII->getInstSizeInBytes(*I), I = std::next(I)) {
+ BaseInsertOffset =
+ std::max(BaseInsertOffset, Offset + TII->getInstSizeInBytes(*I) + 1);
+ assert(I != UserMBB->end() && "Fell off end of block");
+ }
LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
}
unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
@@ -1354,8 +1390,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
CPUser &U = CPUsers[CPUIndex];
if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
// Shift insertion point by one unit of alignment so it is within reach.
- BaseInsertOffset -= 1u << LogAlign;
- EndInsertOffset -= 1u << LogAlign;
+ BaseInsertOffset -= Align.value();
+ EndInsertOffset -= Align.value();
}
// This is overly conservative, as we don't account for CPEMIs being
// reused within the block, but it doesn't matter much. Also assume CPEs
@@ -1397,9 +1433,10 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
}
// We really must not split an IT block.
- LLVM_DEBUG(unsigned PredReg; assert(
- !isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
-
+#ifndef NDEBUG
+ unsigned PredReg;
+ assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL);
+#endif
NewMBB = splitBlockBeforeInstr(&*MI);
}
@@ -1464,9 +1501,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
// Always align the new block because CP entries can be smaller than 4
// bytes. Be careful not to decrease the existing alignment, e.g. NewMBB may
// be an already aligned constant pool block.
- const unsigned Align = isThumb ? 1 : 2;
- if (NewMBB->getAlignment() < Align)
- NewMBB->setAlignment(Align);
+ const Align Alignment = isThumb ? Align(2) : Align(4);
+ if (NewMBB->getAlignment() < Alignment)
+ NewMBB->setAlignment(Alignment);
// Remove the original WaterList entry; we want subsequent insertions in
// this vicinity to go after the one we're about to insert. This
@@ -1495,7 +1532,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
decrementCPEReferenceCount(CPI, CPEMI);
// Mark the basic block as aligned as required by the const-pool entry.
- NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+ NewIsland->setAlignment(getCPEAlign(U.CPEMI));
// Increase the size of the island block to account for the new entry.
BBUtils->adjustBBSize(NewIsland, Size);
@@ -1529,10 +1566,11 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
BBInfo[CPEBB->getNumber()].Size = 0;
// This block no longer needs to be aligned.
- CPEBB->setAlignment(0);
- } else
+ CPEBB->setAlignment(Align::None());
+ } else {
// Entries are sorted by descending alignment, so realign from the front.
- CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin()));
+ CPEBB->setAlignment(getCPEAlign(&*CPEBB->begin()));
+ }
BBUtils->adjustBBOffsetsAfter(CPEBB);
// An island has only one predecessor BB and one successor BB. Check if
@@ -1620,7 +1658,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
// L2:
ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm();
CC = ARMCC::getOppositeCondition(CC);
- unsigned CCReg = MI->getOperand(2).getReg();
+ Register CCReg = MI->getOperand(2).getReg();
// If the branch is at the end of its MBB and that has a fall-through block,
// direct the updated conditional branch to the fall-through block. Otherwise,
@@ -1778,16 +1816,10 @@ bool ARMConstantIslands::optimizeThumb2Instructions() {
return MadeChange;
}
+
bool ARMConstantIslands::optimizeThumb2Branches() {
- bool MadeChange = false;
- // The order in which branches appear in ImmBranches is approximately their
- // order within the function body. By visiting later branches first, we reduce
- // the distance between earlier forward branches and their targets, making it
- // more likely that the cbn?z optimization, which can only apply to forward
- // branches, will succeed.
- for (unsigned i = ImmBranches.size(); i != 0; --i) {
- ImmBranch &Br = ImmBranches[i-1];
+ auto TryShrinkBranch = [this](ImmBranch &Br) {
unsigned Opcode = Br.MI->getOpcode();
unsigned NewOpc = 0;
unsigned Scale = 1;
@@ -1815,47 +1847,115 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
BBUtils->adjustBBSize(MBB, -2);
BBUtils->adjustBBOffsetsAfter(MBB);
++NumT2BrShrunk;
- MadeChange = true;
+ return true;
}
}
+ return false;
+ };
- Opcode = Br.MI->getOpcode();
- if (Opcode != ARM::tBcc)
- continue;
+ struct ImmCompare {
+ MachineInstr* MI = nullptr;
+ unsigned NewOpc = 0;
+ };
+
+ auto FindCmpForCBZ = [this](ImmBranch &Br, ImmCompare &ImmCmp,
+ MachineBasicBlock *DestBB) {
+ ImmCmp.MI = nullptr;
+ ImmCmp.NewOpc = 0;
// If the conditional branch doesn't kill CPSR, then CPSR can be liveout
// so this transformation is not safe.
if (!Br.MI->killsRegister(ARM::CPSR))
- continue;
+ return false;
- NewOpc = 0;
unsigned PredReg = 0;
+ unsigned NewOpc = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg);
if (Pred == ARMCC::EQ)
NewOpc = ARM::tCBZ;
else if (Pred == ARMCC::NE)
NewOpc = ARM::tCBNZ;
- if (!NewOpc)
- continue;
- MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ else
+ return false;
+
// Check if the distance is within 126. Subtract starting offset by 2
// because the cmp will be eliminated.
unsigned BrOffset = BBUtils->getOffsetOf(Br.MI) + 4 - 2;
BBInfoVector &BBInfo = BBUtils->getBBInfo();
unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
if (BrOffset >= DestOffset || (DestOffset - BrOffset) > 126)
- continue;
+ return false;
// Search backwards to find a tCMPi8
auto *TRI = STI->getRegisterInfo();
MachineInstr *CmpMI = findCMPToFoldIntoCBZ(Br.MI, TRI);
if (!CmpMI || CmpMI->getOpcode() != ARM::tCMPi8)
+ return false;
+
+ ImmCmp.MI = CmpMI;
+ ImmCmp.NewOpc = NewOpc;
+ return true;
+ };
+
+ auto TryConvertToLE = [this](ImmBranch &Br, ImmCompare &Cmp) {
+ if (Br.MI->getOpcode() != ARM::t2Bcc || !STI->hasLOB() ||
+ STI->hasMinSize())
+ return false;
+
+ MachineBasicBlock *MBB = Br.MI->getParent();
+ MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ if (BBUtils->getOffsetOf(MBB) < BBUtils->getOffsetOf(DestBB) ||
+ !BBUtils->isBBInRange(Br.MI, DestBB, 4094))
+ return false;
+
+ if (!DT->dominates(DestBB, MBB))
+ return false;
+
+ // We queried for the CBN?Z opcode based upon the 'ExitBB', the opposite
+ // target of Br. So now we need to reverse the condition.
+ Cmp.NewOpc = Cmp.NewOpc == ARM::tCBZ ? ARM::tCBNZ : ARM::tCBZ;
+
+ MachineInstrBuilder MIB = BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(),
+ TII->get(ARM::t2LE));
+ MIB.add(Br.MI->getOperand(0));
+ Br.MI->eraseFromParent();
+ Br.MI = MIB;
+ ++NumLEInserted;
+ return true;
+ };
+
+ bool MadeChange = false;
+
+ // The order in which branches appear in ImmBranches is approximately their
+ // order within the function body. By visiting later branches first, we reduce
+ // the distance between earlier forward branches and their targets, making it
+ // more likely that the cbn?z optimization, which can only apply to forward
+ // branches, will succeed.
+ for (ImmBranch &Br : reverse(ImmBranches)) {
+ MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+ MachineBasicBlock *MBB = Br.MI->getParent();
+ MachineBasicBlock *ExitBB = &MBB->back() == Br.MI ?
+ MBB->getFallThrough() :
+ MBB->back().getOperand(0).getMBB();
+
+ ImmCompare Cmp;
+ if (FindCmpForCBZ(Br, Cmp, ExitBB) && TryConvertToLE(Br, Cmp)) {
+ DestBB = ExitBB;
+ MadeChange = true;
+ } else {
+ FindCmpForCBZ(Br, Cmp, DestBB);
+ MadeChange |= TryShrinkBranch(Br);
+ }
+
+ unsigned Opcode = Br.MI->getOpcode();
+ if ((Opcode != ARM::tBcc && Opcode != ARM::t2LE) || !Cmp.NewOpc)
continue;
- unsigned Reg = CmpMI->getOperand(0).getReg();
+ Register Reg = Cmp.MI->getOperand(0).getReg();
// Check for Kill flags on Reg. If they are present remove them and set kill
// on the new CBZ.
+ auto *TRI = STI->getRegisterInfo();
MachineBasicBlock::iterator KillMI = Br.MI;
bool RegKilled = false;
do {
@@ -1865,19 +1965,32 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
RegKilled = true;
break;
}
- } while (KillMI != CmpMI);
+ } while (KillMI != Cmp.MI);
// Create the new CBZ/CBNZ
- MachineBasicBlock *MBB = Br.MI->getParent();
- LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+ LLVM_DEBUG(dbgs() << "Fold: " << *Cmp.MI << " and: " << *Br.MI);
MachineInstr *NewBR =
- BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(NewOpc))
+ BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(Cmp.NewOpc))
.addReg(Reg, getKillRegState(RegKilled))
.addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags());
- CmpMI->eraseFromParent();
- Br.MI->eraseFromParent();
- Br.MI = NewBR;
+
+ Cmp.MI->eraseFromParent();
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
BBInfo[MBB->getNumber()].Size -= 2;
+
+ if (Br.MI->getOpcode() == ARM::tBcc) {
+ Br.MI->eraseFromParent();
+ Br.MI = NewBR;
+ } else if (&MBB->back() != Br.MI) {
+ // We've generated an LE and already erased the original conditional
+ // branch. The CBN?Z is now used to branch to the other successor, so an
+ // unconditional branch terminator is now redundant.
+ MachineInstr *LastMI = &MBB->back();
+ if (LastMI != Br.MI) {
+ BBInfo[MBB->getNumber()].Size -= LastMI->getDesc().getSize();
+ LastMI->eraseFromParent();
+ }
+ }
BBUtils->adjustBBOffsetsAfter(MBB);
++NumCBZ;
MadeChange = true;
@@ -1931,8 +2044,8 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI,
// of BaseReg, but only if the t2ADDrs can be removed.
// + Some instruction other than t2ADDrs computing the entry. Not seen in
// the wild, but we should be careful.
- unsigned EntryReg = JumpMI->getOperand(0).getReg();
- unsigned BaseReg = LEAMI->getOperand(0).getReg();
+ Register EntryReg = JumpMI->getOperand(0).getReg();
+ Register BaseReg = LEAMI->getOperand(0).getReg();
CanDeleteLEA = true;
BaseRegKill = false;
@@ -2009,7 +2122,7 @@ static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
// but the JT now uses PC. Finds the last ADD (if any) that def's EntryReg
// and is not clobbered / used.
MachineInstr *RemovableAdd = nullptr;
- unsigned EntryReg = JumpMI->getOperand(0).getReg();
+ Register EntryReg = JumpMI->getOperand(0).getReg();
// Find the last ADD to set EntryReg
MachineBasicBlock::iterator I(LEAMI);
@@ -2106,7 +2219,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
// %idx = tLSLri %idx, 2
// %base = tLEApcrelJT
// %t = tLDRr %base, %idx
- unsigned BaseReg = User.MI->getOperand(0).getReg();
+ Register BaseReg = User.MI->getOperand(0).getReg();
if (User.MI->getIterator() == User.MI->getParent()->begin())
continue;
@@ -2116,7 +2229,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
!Shift->getOperand(2).isKill())
continue;
IdxReg = Shift->getOperand(2).getReg();
- unsigned ShiftedIdxReg = Shift->getOperand(0).getReg();
+ Register ShiftedIdxReg = Shift->getOperand(0).getReg();
// It's important that IdxReg is live until the actual TBB/TBH. Most of
// the range is checked later, but the LEA might still clobber it and not
@@ -2313,6 +2426,10 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
MachineFunction::iterator MBBI = ++JTBB->getIterator();
MF->insert(MBBI, NewBB);
+ // Copy live-in information to new block.
+ for (const MachineBasicBlock::RegisterMaskPair &RegMaskPair : BB->liveins())
+ NewBB->addLiveIn(RegMaskPair);
+
// Add an unconditional branch from NewBB to BB.
// There doesn't seem to be meaningful DebugInfo available; this doesn't
// correspond directly to anything in the source.
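
The cbn?z folding in optimizeThumb2Branches above depends on the Thumb CBZ/CBNZ encoding: a 6-bit immediate scaled by 2 gives a forward-only range of 0-126 bytes, and erasing the 2-byte tCMPi8 moves the branch 2 bytes closer to its target; the new TryConvertToLE path additionally turns a backwards t2Bcc to a dominating block into a CBN?Z plus a t2LE when the v8.1-M low-overhead-branch extension is available. A minimal standalone sketch of the distance check (mirroring the offsets used in the pass, not LLVM API):

#include <cassert>

// CBZ/CBNZ can only branch forward by 0..126 bytes. BrOffset and DestOffset
// are byte offsets within the function; +4 is the Thumb PC bias and -2
// accounts for the 16-bit compare that gets erased by the fold.
bool fitsInCBZ(unsigned BrOffset, unsigned DestOffset) {
  unsigned From = BrOffset + 4 - 2;
  return From < DestOffset && (DestOffset - From) <= 126;
}

int main() {
  assert(fitsInCBZ(100, 200));   // 98 bytes forward: in range.
  assert(!fitsInCBZ(100, 260));  // 158 bytes forward: out of range.
  assert(!fitsInCBZ(200, 100));  // backward branch: never foldable.
  return 0;
}
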
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 3bdb0e1ef62d..72c95f441265 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index b32ba3eeea18..563fdda56104 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -481,7 +481,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
unsigned OpIdx = 0;
bool DstIsDead = MI.getOperand(OpIdx).isDead();
- unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+ Register DstReg = MI.getOperand(OpIdx++).getReg();
if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
TableEntry->RealOpc == ARM::VLD2DUPd32x2) {
@@ -492,7 +492,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
assert(RegSpc == OddDblSpc && "Unexpected spacing!");
SubRegIndex = ARM::dsub_1;
}
- unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex);
+ Register SubReg = TRI->getSubReg(DstReg, SubRegIndex);
unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0,
&ARM::DPairSpcRegClass);
MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead));
@@ -624,7 +624,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
- unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+ Register SrcReg = MI.getOperand(OpIdx++).getReg();
unsigned D0, D1, D2, D3;
GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3);
MIB.addReg(D0, getUndefRegState(SrcIsUndef));
@@ -760,7 +760,7 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
}
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
- unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+ Register SrcReg = MI.getOperand(OpIdx++).getReg();
unsigned D0, D1, D2, D3;
GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3);
MIB.addReg(D0);
@@ -789,6 +789,7 @@ static bool IsAnAddressOperand(const MachineOperand &MO) {
case MachineOperand::MO_Immediate:
case MachineOperand::MO_CImmediate:
case MachineOperand::MO_FPImmediate:
+ case MachineOperand::MO_ShuffleMask:
return false;
case MachineOperand::MO_MachineBasicBlock:
return true;
@@ -828,7 +829,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
unsigned Opcode = MI.getOpcode();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
@@ -932,13 +933,13 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Dest = MI.getOperand(0);
- unsigned TempReg = MI.getOperand(1).getReg();
+ Register TempReg = MI.getOperand(1).getReg();
// Duplicating undef operands into 2 instructions does not guarantee the same
// value on both; However undef should be replaced by xzr anyway.
assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
- unsigned AddrReg = MI.getOperand(2).getReg();
- unsigned DesiredReg = MI.getOperand(3).getReg();
- unsigned NewReg = MI.getOperand(4).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register DesiredReg = MI.getOperand(3).getReg();
+ Register NewReg = MI.getOperand(4).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -1035,8 +1036,8 @@ static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
unsigned Flags, bool IsThumb,
const TargetRegisterInfo *TRI) {
if (IsThumb) {
- unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
- unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
+ Register RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
+ Register RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
MIB.addReg(RegLo, Flags);
MIB.addReg(RegHi, Flags);
} else
@@ -1051,19 +1052,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
- unsigned TempReg = MI.getOperand(1).getReg();
+ Register TempReg = MI.getOperand(1).getReg();
// Duplicating undef operands into 2 instructions does not guarantee the same
// value on both; However undef should be replaced by xzr anyway.
assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
- unsigned AddrReg = MI.getOperand(2).getReg();
- unsigned DesiredReg = MI.getOperand(3).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register DesiredReg = MI.getOperand(3).getReg();
MachineOperand New = MI.getOperand(4);
New.setIsKill(false);
- unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
- unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
- unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0);
- unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1);
+ Register DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
+ Register DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
+ Register DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0);
+ Register DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1);
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -1204,8 +1205,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
NewMI->addOperand(MBBI->getOperand(i));
- // Delete the pseudo instruction TCRETURN.
+
+ // Update call site info and delete the pseudo instruction TCRETURN.
+ MBB.getParent()->moveCallSiteInfo(&MI, &*NewMI);
MBB.erase(MBBI);
+
MBBI = NewMI;
return true;
}
@@ -1336,7 +1340,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// for us. Otherwise, expand to nothing.
if (RI.hasBasePointer(MF)) {
int32_t NumBytes = AFI->getFramePtrSpillOffset();
- unsigned FramePtr = RI.getFrameRegister(MF);
+ Register FramePtr = RI.getFrameRegister(MF);
assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
"base pointer without frame pointer?");
@@ -1412,7 +1416,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(MF->getFunction().getContext(),
"__aeabi_read_tp", PCLabelID, 0);
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg)
.addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
@@ -1435,6 +1439,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.cloneMemRefs(MI);
TransferImpOps(MI, MIB, MIB);
+ MI.getMF()->moveCallSiteInfo(&MI, &*MIB);
MI.eraseFromParent();
return true;
}
@@ -1442,7 +1447,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::t2LDRpci_pic: {
unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
? ARM::tLDRpci : ARM::t2LDRpci;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg)
@@ -1464,7 +1469,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::LDRLIT_ga_pcrel_ldr:
case ARM::tLDRLIT_ga_abs:
case ARM::tLDRLIT_ga_pcrel: {
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
const MachineOperand &MO1 = MI.getOperand(1);
auto Flags = MO1.getTargetFlags();
@@ -1522,7 +1527,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::t2MOV_ga_pcrel: {
// Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode.
unsigned LabelId = AFI->createPICLabelUId();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
const MachineOperand &MO1 = MI.getOperand(1);
const GlobalValue *GV = MO1.getGlobal();
@@ -1586,7 +1591,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Grab the Q register destination.
bool DstIsDead = MI.getOperand(OpIdx).isDead();
- unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+ Register DstReg = MI.getOperand(OpIdx++).getReg();
// Copy the source register.
MIB.add(MI.getOperand(OpIdx++));
@@ -1596,8 +1601,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.add(MI.getOperand(OpIdx++));
// Add the destination operands (D subregs).
- unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
- unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
+ Register D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
+ Register D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead))
.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
@@ -1617,7 +1622,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Grab the Q register source.
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
- unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+ Register SrcReg = MI.getOperand(OpIdx++).getReg();
// Copy the destination register.
MachineOperand Dst(MI.getOperand(OpIdx++));
@@ -1628,8 +1633,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.add(MI.getOperand(OpIdx++));
// Add the source operands (D subregs).
- unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
- unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
+ Register D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
+ Register D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0)
.addReg(D1, SrcIsKill ? RegState::Kill : 0);
@@ -1915,6 +1920,37 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::CMP_SWAP_64:
return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
+
+ case ARM::tBL_PUSHLR:
+ case ARM::BL_PUSHLR: {
+ const bool Thumb = Opcode == ARM::tBL_PUSHLR;
+ Register Reg = MI.getOperand(0).getReg();
+ assert(Reg == ARM::LR && "expect LR register!");
+ MachineInstrBuilder MIB;
+ if (Thumb) {
+ // push {lr}
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
+ .addReg(Reg);
+
+ // bl __gnu_mcount_nc
+ MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tBL));
+ } else {
+ // stmdb sp!, {lr}
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(Reg);
+
+ // bl __gnu_mcount_nc
+ MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::BL));
+ }
+ MIB.cloneMemRefs(MI);
+ for (unsigned i = 1; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i));
+ MI.eraseFromParent();
+ return true;
+ }
}
}
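
For context on the CMP_SWAP pseudos touched above: they carry the semantics of an atomic compare-and-exchange and are expanded late into the exclusive-load/store loop built around the LoadCmpBB blocks created in ExpandCMP_SWAP. A minimal sketch of the equivalent operation at the source level, using the Clang/GCC __atomic builtin (illustrative only, not what the pass emits):

#include <cstdint>

// If *Addr == Desired, store New and report success; otherwise load the
// current value back into Desired and report failure.
bool cmpSwap32(uint32_t *Addr, uint32_t &Desired, uint32_t New) {
  return __atomic_compare_exchange_n(Addr, &Desired, New,
                                     /*weak=*/false,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main() {
  uint32_t Value = 42, Expected = 42;
  bool Swapped = cmpSwap32(&Value, Expected, 7);
  return (Swapped && Value == 7) ? 0 : 1;
}
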
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 6e274d269bf2..1fc5ff6921c6 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -191,8 +191,8 @@ class ARMFastISel final : public FastISel {
bool isTypeLegal(Type *Ty, MVT &VT);
bool isLoadTypeLegal(Type *Ty, MVT &VT);
bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt, bool isEquality);
- bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ bool isZExt);
+ bool ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
unsigned Alignment = 0, bool isZExt = true,
bool allocReg = true);
bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
@@ -219,15 +219,15 @@ class ARMFastISel final : public FastISel {
bool Return,
bool isVarArg);
bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
- SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<Register> &ArgRegs,
SmallVectorImpl<MVT> &ArgVTs,
SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
- SmallVectorImpl<unsigned> &RegArgs,
+ SmallVectorImpl<Register> &RegArgs,
CallingConv::ID CC,
unsigned &NumBytes,
bool isVarArg);
unsigned getLibcallReg(const Twine &Name);
- bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+ bool FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs,
const Instruction *I, CallingConv::ID CC,
unsigned &NumBytes, bool isVarArg);
bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
@@ -301,7 +301,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
// Make sure the input operand is sufficiently constrained to be legal
@@ -913,7 +913,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
AddOptionalDefs(MIB);
}
-bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
unsigned Alignment, bool isZExt, bool allocReg) {
unsigned Opc;
bool useAM3 = false;
@@ -1045,7 +1045,7 @@ bool ARMFastISel::SelectLoad(const Instruction *I) {
Address Addr;
if (!ARMComputeAddress(I->getOperand(0), Addr)) return false;
- unsigned ResultReg;
+ Register ResultReg;
if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
return false;
updateValueMap(I, ResultReg);
@@ -1259,8 +1259,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
if (ARMPred == ARMCC::AL) return false;
// Emit the compare.
- if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
- CI->isEquality()))
+ if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
return false;
unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
@@ -1349,7 +1348,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
}
bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt, bool isEquality) {
+ bool isZExt) {
Type *Ty = Src1Value->getType();
EVT SrcEVT = TLI.getValueType(DL, Ty, true);
if (!SrcEVT.isSimple()) return false;
@@ -1397,19 +1396,11 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
// TODO: Verify compares.
case MVT::f32:
isICmp = false;
- // Equality comparisons shouldn't raise Invalid on uordered inputs.
- if (isEquality)
- CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS;
- else
- CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
+ CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS;
break;
case MVT::f64:
isICmp = false;
- // Equality comparisons shouldn't raise Invalid on uordered inputs.
- if (isEquality)
- CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD;
- else
- CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
+ CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD;
break;
case MVT::i1:
case MVT::i8:
@@ -1485,8 +1476,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
if (ARMPred == ARMCC::AL) return false;
// Emit the compare.
- if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
- CI->isEquality()))
+ if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
return false;
// Now set a register based on the comparison. Explicitly set the predicates
@@ -1893,10 +1883,10 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
}
bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
- SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<Register> &ArgRegs,
SmallVectorImpl<MVT> &ArgVTs,
SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
- SmallVectorImpl<unsigned> &RegArgs,
+ SmallVectorImpl<Register> &RegArgs,
CallingConv::ID CC,
unsigned &NumBytes,
bool isVarArg) {
@@ -1960,7 +1950,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
const Value *ArgVal = Args[VA.getValNo()];
- unsigned Arg = ArgRegs[VA.getValNo()];
+ Register Arg = ArgRegs[VA.getValNo()];
MVT ArgVT = ArgVTs[VA.getValNo()];
assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) &&
@@ -2039,7 +2029,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
return true;
}
-bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs,
const Instruction *I, CallingConv::ID CC,
unsigned &NumBytes, bool isVarArg) {
// Issue CALLSEQ_END
@@ -2060,7 +2050,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
// double fp reg we want.
MVT DestVT = RVLocs[0].getValVT();
const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT);
- unsigned ResultReg = createResultReg(DstRC);
+ Register ResultReg = createResultReg(DstRC);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVDRR), ResultReg)
.addReg(RVLocs[0].getLocReg())
@@ -2081,7 +2071,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
- unsigned ResultReg = createResultReg(DstRC);
+ Register ResultReg = createResultReg(DstRC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY),
ResultReg).addReg(RVLocs[0].getLocReg());
@@ -2162,7 +2152,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
}
// Make the copy.
- unsigned DstReg = VA.getLocReg();
+ Register DstReg = VA.getLocReg();
const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
// Avoid a cross-class copy. This is very unlikely.
if (!SrcRC->contains(DstReg))
@@ -2231,7 +2221,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
// Set up the argument vectors.
SmallVector<Value*, 8> Args;
- SmallVector<unsigned, 8> ArgRegs;
+ SmallVector<Register, 8> ArgRegs;
SmallVector<MVT, 8> ArgVTs;
SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
Args.reserve(I->getNumOperands());
@@ -2247,8 +2237,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
if (!isTypeLegal(ArgTy, ArgVT)) return false;
ISD::ArgFlagsTy Flags;
- unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
- Flags.setOrigAlign(OriginalAlignment);
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy)));
Args.push_back(Op);
ArgRegs.push_back(Arg);
@@ -2257,13 +2246,13 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
}
// Handle the arguments now that we've gotten them.
- SmallVector<unsigned, 4> RegArgs;
+ SmallVector<Register, 4> RegArgs;
unsigned NumBytes;
if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
RegArgs, CC, NumBytes, false))
return false;
- unsigned CalleeReg = 0;
+ Register CalleeReg;
if (Subtarget->genLongCalls()) {
CalleeReg = getLibcallReg(TLI.getLibcallName(Call));
if (CalleeReg == 0) return false;
@@ -2282,7 +2271,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
MIB.addExternalSymbol(TLI.getLibcallName(Call));
// Add implicit physical register uses to the call.
- for (unsigned R : RegArgs)
+ for (Register R : RegArgs)
MIB.addReg(R, RegState::Implicit);
// Add a register mask with the call-preserved registers.
@@ -2290,7 +2279,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
// Finish off the call including any return values.
- SmallVector<unsigned, 4> UsedRegs;
+ SmallVector<Register, 4> UsedRegs;
if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false;
// Set all unused physreg defs as dead.
@@ -2340,7 +2329,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// Set up the argument vectors.
SmallVector<Value*, 8> Args;
- SmallVector<unsigned, 8> ArgRegs;
+ SmallVector<Register, 8> ArgRegs;
SmallVector<MVT, 8> ArgVTs;
SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
unsigned arg_size = CS.arg_size();
@@ -2377,12 +2366,11 @@ bool ARMFastISel::SelectCall(const Instruction *I,
ArgVT != MVT::i1)
return false;
- unsigned Arg = getRegForValue(*i);
- if (Arg == 0)
+ Register Arg = getRegForValue(*i);
+ if (!Arg.isValid())
return false;
- unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
- Flags.setOrigAlign(OriginalAlignment);
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy)));
Args.push_back(*i);
ArgRegs.push_back(Arg);
@@ -2391,7 +2379,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
}
// Handle the arguments now that we've gotten them.
- SmallVector<unsigned, 4> RegArgs;
+ SmallVector<Register, 4> RegArgs;
unsigned NumBytes;
if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
RegArgs, CC, NumBytes, isVarArg))
@@ -2401,7 +2389,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
if (!GV || Subtarget->genLongCalls()) UseReg = true;
- unsigned CalleeReg = 0;
+ Register CalleeReg;
if (UseReg) {
if (IntrMemName)
CalleeReg = getLibcallReg(IntrMemName);
@@ -2427,7 +2415,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
MIB.addExternalSymbol(IntrMemName, 0);
// Add implicit physical register uses to the call.
- for (unsigned R : RegArgs)
+ for (Register R : RegArgs)
MIB.addReg(R, RegState::Implicit);
// Add a register mask with the call-preserved registers.
@@ -2435,7 +2423,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
// Finish off the call including any return values.
- SmallVector<unsigned, 4> UsedRegs;
+ SmallVector<Register, 4> UsedRegs;
if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg))
return false;
@@ -2476,7 +2464,7 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
}
bool RV;
- unsigned ResultReg;
+ Register ResultReg;
RV = ARMEmitLoad(VT, ResultReg, Src);
assert(RV && "Should be able to handle this load.");
RV = ARMEmitStore(VT, ResultReg, Dest);
@@ -2506,7 +2494,7 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
const ARMBaseRegisterInfo *RegInfo =
static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo());
- unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
unsigned SrcReg = FramePtr;
// Recursively load frame address
@@ -2947,7 +2935,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
Address Addr;
if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
- unsigned ResultReg = MI->getOperand(0).getReg();
+ Register ResultReg = MI->getOperand(0).getReg();
if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
return false;
MachineBasicBlock::iterator I(MI);
@@ -2974,7 +2962,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
MachineMemOperand::MOLoad, 4, 4);
- unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
+ Register TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp;
MachineInstrBuilder MIB =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg)
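Most of the FastISel churn above is the mechanical move from raw unsigned virtual-register numbers to the Register wrapper. A minimal stand-in sketch of the idea (not LLVM's actual class; the type name here is illustrative) shows why call sites such as "if (!Arg.isValid())" read better than comparing against 0, while older unsigned-taking interfaces keep compiling:

#include <cassert>

// Simplified stand-in for llvm::Register: default-constructed means "no
// register", isValid() replaces the old "Reg != 0" idiom, and the implicit
// conversion keeps existing unsigned-based interfaces working.
struct RegisterLike {
  unsigned Id = 0;
  constexpr RegisterLike() = default;
  constexpr RegisterLike(unsigned R) : Id(R) {}
  constexpr bool isValid() const { return Id != 0; }
  constexpr operator unsigned() const { return Id; }
};

int main() {
  RegisterLike CalleeReg;            // mirrors "Register CalleeReg;" above
  assert(!CalleeReg.isValid());
  CalleeReg = RegisterLike(42);
  assert(CalleeReg.isValid() && unsigned(CalleeReg) == 42);
}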
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index bedb779bcba0..01ae93086dcb 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -76,7 +76,7 @@ skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
unsigned NumAlignedDPRCS2Regs);
ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
- : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
+ : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, Align(4)),
STI(sti) {}
bool ARMFrameLowering::keepFramePointer(const MachineFunction &MF) const {
@@ -376,7 +376,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// to determine the end of the prologue.
DebugLoc dl;
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
// Determine the sizes of each callee-save spill areas and record which frame
// belongs to which callee-save spill areas.
@@ -780,7 +780,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
int NumBytes = (int)MFI.getStackSize();
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
@@ -1503,11 +1503,17 @@ static unsigned EstimateFunctionSizeInBytes(const MachineFunction &MF,
/// instructions will require a scratch register during their expansion later.
// FIXME: Move to TII?
static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
- const TargetFrameLowering *TFI) {
+ const TargetFrameLowering *TFI,
+ bool &HasNonSPFrameIndex) {
const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ const ARMBaseInstrInfo &TII =
+ *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
unsigned Limit = (1 << 12) - 1;
for (auto &MBB : MF) {
for (auto &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isFI())
continue;
@@ -1518,13 +1524,29 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
Limit = std::min(Limit, (1U << 8) - 1);
break;
}
+        // t2ADDri will not require an extra register; it can reuse the
+ // destination.
+ if (MI.getOpcode() == ARM::t2ADDri || MI.getOpcode() == ARM::t2ADDri12)
+ break;
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI, MF);
+ if (RegClass && !RegClass->contains(ARM::SP))
+ HasNonSPFrameIndex = true;
// Otherwise check the addressing mode.
switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) {
+ case ARMII::AddrMode_i12:
+ case ARMII::AddrMode2:
+ // Default 12 bit limit.
+ break;
case ARMII::AddrMode3:
case ARMII::AddrModeT2_i8:
Limit = std::min(Limit, (1U << 8) - 1);
break;
+ case ARMII::AddrMode5FP16:
+ Limit = std::min(Limit, ((1U << 8) - 1) * 2);
+ break;
case ARMII::AddrMode5:
case ARMII::AddrModeT2_i8s4:
case ARMII::AddrModeT2_ldrex:
@@ -1541,8 +1563,17 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
// Addressing modes 4 & 6 (load/store) instructions can't encode an
// immediate offset for stack references.
return 0;
- default:
+ case ARMII::AddrModeT2_i7:
+ Limit = std::min(Limit, ((1U << 7) - 1) * 1);
+ break;
+ case ARMII::AddrModeT2_i7s2:
+ Limit = std::min(Limit, ((1U << 7) - 1) * 2);
break;
+ case ARMII::AddrModeT2_i7s4:
+ Limit = std::min(Limit, ((1U << 7) - 1) * 4);
+ break;
+ default:
+ llvm_unreachable("Unhandled addressing mode in stack size limit calculation");
}
break; // At most one FI per instruction
}
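The new cases above only tighten the conservative offset estimate; as a rough standalone illustration (hypothetical enum, the real code switches on the ARMII::AddrModeMask bits), each addressing mode caps the reachable stack offset at its immediate width times its scale:

#include <algorithm>

// Hypothetical reduced addressing-mode enum, for illustration only.
enum class AddrMode { T2_i8, FP16_i8s2, T2_i7, T2_i7s2, T2_i7s4 };

static unsigned clampLimit(unsigned Limit, AddrMode AM) {
  switch (AM) {
  case AddrMode::T2_i8:     return std::min(Limit, (1U << 8) - 1);       // 255
  case AddrMode::FP16_i8s2: return std::min(Limit, ((1U << 8) - 1) * 2); // 510
  case AddrMode::T2_i7:     return std::min(Limit, (1U << 7) - 1);       // 127
  case AddrMode::T2_i7s2:   return std::min(Limit, ((1U << 7) - 1) * 2); // 254
  case AddrMode::T2_i7s4:   return std::min(Limit, ((1U << 7) - 1) * 4); // 508
  }
  return Limit;
}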
@@ -1623,7 +1654,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
(void)TRI; // Silence unused warning in non-assert builds.
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
// Spill R4 if Thumb2 function requires stack realignment - it will be used as
// scratch register. Also spill R4 if Thumb2 function has varsized objects,
@@ -1784,6 +1815,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
EstimatedStackSize += 16; // For possible paddings.
unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit;
+ bool HasNonSPFrameIndex = false;
if (AFI->isThumb1OnlyFunction()) {
// For Thumb1, don't bother to iterate over the function. The only
// instruction that requires an emergency spill slot is a store to a
@@ -1804,7 +1836,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
EstimatedRSStackSizeLimit = (1U << 8) * 4;
EstimatedRSFixedSizeLimit = (1U << 5) * 4;
} else {
- EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+ EstimatedRSStackSizeLimit =
+ estimateRSStackSizeLimit(MF, this, HasNonSPFrameIndex);
EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit;
}
// Final estimate of whether sp or bp-relative accesses might require
@@ -1830,12 +1863,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;
bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP ||
- HasLargeArgumentList;
+ HasLargeArgumentList || HasNonSPFrameIndex;
LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit
- << "; EstimatedStack" << EstimatedStackSize
- << "; EstimatedFPStack" << MaxFixedOffset - MaxFPOffset
- << "; BigFrameOffsets: " << BigFrameOffsets
- << "\n");
+ << "; EstimatedStack: " << EstimatedStackSize
+ << "; EstimatedFPStack: " << MaxFixedOffset - MaxFPOffset
+ << "; BigFrameOffsets: " << BigFrameOffsets << "\n");
if (BigFrameOffsets ||
!CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
AFI->setHasStackFrame(true);
@@ -2080,9 +2112,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
ExtraCSSpill = true;
}
}
- if (!ExtraCSSpill) {
+ if (!ExtraCSSpill && RS) {
// Reserve a slot closest to SP or frame pointer.
- assert(RS && "Register scavenging not provided");
LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n");
const TargetRegisterClass &RC = ARM::GPRRegClass;
unsigned Size = TRI->getSpillSize(RC);
@@ -2097,6 +2128,12 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
AFI->setLRIsSpilledForFarJump(true);
}
AFI->setLRIsSpilled(SavedRegs.test(ARM::LR));
+
+ // If we have the "returned" parameter attribute which guarantees that we
+ // return the value which was passed in r0 unmodified (e.g. C++ 'structors),
+ // record that fact for IPRA.
+ if (AFI->getPreservesR0())
+ SavedRegs.set(ARM::R0);
}
MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 7544ca3c38d6..6d8aee597945 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -63,6 +63,11 @@ public:
bool enableShrinkWrapping(const MachineFunction &MF) const override {
return true;
}
+ bool isProfitableForNoCSROpt(const Function &F) const override {
+ // The no-CSR optimisation is bad for code size on ARM, because we can save
+ // many registers with a single PUSH/POP pair.
+ return false;
+ }
private:
void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index b349627b67b1..8f6515c423eb 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -139,6 +139,8 @@ public:
bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
SDValue &OffImm);
bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm);
+ template <unsigned Shift>
+ bool SelectTAddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm);
// Thumb 2 Addressing Modes:
bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
@@ -146,9 +148,12 @@ public:
SDValue &OffImm);
bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
SDValue &OffImm);
- template<unsigned Shift>
- bool SelectT2AddrModeImm7(SDValue N, SDValue &Base,
- SDValue &OffImm);
+ template <unsigned Shift>
+ bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm);
+ bool SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm,
+ unsigned Shift);
+ template <unsigned Shift>
+ bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, SDValue &OffImm);
bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
SDValue &OffReg, SDValue &ShImm);
bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
@@ -179,6 +184,7 @@ private:
bool tryARMIndexedLoad(SDNode *N);
bool tryT1IndexedLoad(SDNode *N);
bool tryT2IndexedLoad(SDNode *N);
+ bool tryMVEIndexedLoad(SDNode *N);
/// SelectVLD - Select NEON load intrinsics. NumVecs should be
/// 1, 2, 3 or 4. The opcode arrays specify the instructions used for
@@ -246,10 +252,6 @@ private:
SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs,
bool is64BitVector);
- /// Returns the number of instructions required to materialize the given
- /// constant in a register, or 3 if a literal pool load is needed.
- unsigned ConstantMaterializationCost(unsigned Val) const;
-
/// Checks if N is a multiplication by a constant where we can extract out a
/// power of two from the constant so that it can be used in a shift, but only
/// if it simplifies the materialization of the constant. Returns true if it
@@ -450,27 +452,6 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
(ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
}
-unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
- if (Subtarget->isThumb()) {
- if (Val <= 255) return 1; // MOV
- if (Subtarget->hasV6T2Ops() &&
- (Val <= 0xffff || // MOV
- ARM_AM::getT2SOImmVal(Val) != -1 || // MOVW
- ARM_AM::getT2SOImmVal(~Val) != -1)) // MVN
- return 1;
- if (Val <= 510) return 2; // MOV + ADDi8
- if (~Val <= 255) return 2; // MOV + MVN
- if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL
- } else {
- if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV
- if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN
- if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
- if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs
- }
- if (Subtarget->useMovt()) return 2; // MOVW + MOVT
- return 3; // Literal pool load
-}
-
bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
unsigned MaxShift,
unsigned &PowerOfTwo,
@@ -500,8 +481,8 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
// Only optimise if the new cost is better
unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo);
NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32);
- unsigned OldCost = ConstantMaterializationCost(MulConstVal);
- unsigned NewCost = ConstantMaterializationCost(NewMulConstVal);
+ unsigned OldCost = ConstantMaterializationCost(MulConstVal, Subtarget);
+ unsigned NewCost = ConstantMaterializationCost(NewMulConstVal, Subtarget);
return NewCost < OldCost;
}
@@ -1172,6 +1153,28 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
return false;
}
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectTAddrModeImm7(SDValue N, SDValue &Base,
+ SDValue &OffImm) {
+ if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) {
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80,
+ RHSC)) {
+ Base = N.getOperand(0);
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+ OffImm =
+ CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+
+ // Base only.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// Thumb 2 Addressing Modes
@@ -1278,35 +1281,59 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
return false;
}
-template<unsigned Shift>
-bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N,
- SDValue &Base, SDValue &OffImm) {
- if (N.getOpcode() == ISD::SUB ||
- CurDAG->isBaseWithConstantOffset(N)) {
- if (auto RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- int RHSC = (int)RHS->getZExtValue();
- if (N.getOpcode() == ISD::SUB)
- RHSC = -RHSC;
-
- if (isShiftedInt<7, Shift>(RHSC)) {
- Base = N.getOperand(0);
- if (Base.getOpcode() == ISD::FrameIndex) {
- int FI = cast<FrameIndexSDNode>(Base)->getIndex();
- Base = CurDAG->getTargetFrameIndex(
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, SDValue &Base,
+ SDValue &OffImm) {
+ if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) {
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -0x7f, 0x80,
+ RHSC)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
- }
- OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
- return true;
}
+
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+ OffImm =
+ CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32);
+ return true;
}
}
// Base only.
Base = N;
- OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
return true;
}
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N,
+ SDValue &OffImm) {
+ return SelectT2AddrModeImm7Offset(Op, N, OffImm, Shift);
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N,
+ SDValue &OffImm,
+ unsigned Shift) {
+ unsigned Opcode = Op->getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ int RHSC;
+ if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits.
+ OffImm =
+ ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
+ ? CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32)
+ : CurDAG->getTargetConstant(-RHSC * (1 << Shift), SDLoc(N),
+ MVT::i32);
+ return true;
+ }
+ return false;
+}
+
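SelectT2AddrModeImm7Offset above accepts an offset only when it is a multiple of the access size and its scaled magnitude fits in 7 bits; a small sketch of that predicate, assuming isScaledConstantInRange behaves as the 0..0x80 bounds suggest:

#include <cstdlib>

// Illustrative encodability check for the scaled imm7 offsets used above.
static bool isEncodableImm7(int Offset, unsigned Shift) {
  int Scale = 1 << Shift;             // 1, 2 or 4 bytes per element
  if (Offset % Scale != 0)
    return false;                     // must be a multiple of the scale
  int Units = std::abs(Offset) / Scale;
  return Units < 0x80;                // 7-bit magnitude; the direction comes
                                      // from the pre/post inc/dec addressing mode
}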
bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
SDValue &Base,
SDValue &OffReg, SDValue &ShImm) {
@@ -1565,6 +1592,68 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
return false;
}
+bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED)
+ return false;
+ EVT LoadedVT = LD->getMemoryVT();
+ if (!LoadedVT.isVector())
+ return false;
+ bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+ SDValue Offset;
+ bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ unsigned Opcode = 0;
+ unsigned Align = LD->getAlignment();
+ bool IsLE = Subtarget->isLittle();
+
+ if (Align >= 2 && LoadedVT == MVT::v4i16 &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) {
+ if (isSExtLd)
+ Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post;
+ else
+ Opcode = isPre ? ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post;
+ } else if (LoadedVT == MVT::v8i8 &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) {
+ if (isSExtLd)
+ Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post;
+ else
+ Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post;
+ } else if (LoadedVT == MVT::v4i8 &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) {
+ if (isSExtLd)
+ Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post;
+ else
+ Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post;
+ } else if (Align >= 4 &&
+ (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2))
+ Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post;
+ else if (Align >= 2 &&
+ (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1))
+ Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post;
+ else if ((IsLE || LoadedVT == MVT::v16i8) &&
+ SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0))
+ Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post;
+ else
+ return false;
+
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[] = {Base, Offset,
+ CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32),
+ CurDAG->getRegister(0, MVT::i32), Chain};
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0),
+ MVT::i32, MVT::Other, Ops);
+ transferMemOperands(N, New);
+ ReplaceUses(SDValue(N, 0), SDValue(New, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(New, 0));
+ ReplaceUses(SDValue(N, 2), SDValue(New, 2));
+ CurDAG->RemoveDeadNode(N);
+ return true;
+}
+
/// Form a GPRPair pseudo register from a pair of GPR regs.
SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
SDLoc dl(V0.getNode());
@@ -2701,7 +2790,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
// If we can't materialize the constant we need to use a literal pool
- if (ConstantMaterializationCost(Val) > 2) {
+ if (ConstantMaterializationCost(Val, Subtarget) > 2) {
SDValue CPIdx = CurDAG->getTargetConstantPool(
ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
TLI->getPointerTy(CurDAG->getDataLayout()));
@@ -2842,8 +2931,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
bool PreferImmediateEncoding =
Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm));
if (!PreferImmediateEncoding &&
- ConstantMaterializationCost(Imm) >
- ConstantMaterializationCost(~Imm)) {
+ ConstantMaterializationCost(Imm, Subtarget) >
+ ConstantMaterializationCost(~Imm, Subtarget)) {
// The current immediate costs more to materialize than a negated
// immediate, so negate the immediate and use a BIC.
SDValue NewImm =
@@ -2987,6 +3076,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
case ISD::LOAD: {
+ if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N))
+ return;
if (Subtarget->isThumb() && Subtarget->hasThumb2()) {
if (tryT2IndexedLoad(N))
return;
@@ -2998,13 +3089,26 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
// Other cases are autogenerated.
break;
}
- case ARMISD::WLS: {
- SDValue Ops[] = { N->getOperand(1), // Loop count
- N->getOperand(2), // Exit target
+ case ARMISD::WLS:
+ case ARMISD::LE: {
+ SDValue Ops[] = { N->getOperand(1),
+ N->getOperand(2),
+ N->getOperand(0) };
+ unsigned Opc = N->getOpcode() == ARMISD::WLS ?
+ ARM::t2WhileLoopStart : ARM::t2LoopEnd;
+ SDNode *New = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ ReplaceUses(N, New);
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
+ case ARMISD::LOOP_DEC: {
+ SDValue Ops[] = { N->getOperand(1),
+ N->getOperand(2),
N->getOperand(0) };
- SDNode *LoopStart =
- CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops);
- ReplaceUses(N, LoopStart);
+ SDNode *Dec =
+ CurDAG->getMachineNode(ARM::t2LoopDec, dl,
+ CurDAG->getVTList(MVT::i32, MVT::Other), Ops);
+ ReplaceUses(N, Dec);
CurDAG->RemoveDeadNode(N);
return;
}
@@ -4365,7 +4469,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
// Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
// the original GPRs.
- unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
SDValue Chain = SDValue(N,0);
@@ -4401,7 +4505,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
// Copy REG_SEQ into a GPRPair-typed VR and replace the original two
// i32 VRs of inline asm with it.
- unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ Register GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 18bb9bf3eccc..db26feb57010 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -245,7 +245,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
for (auto VT : IntTypes) {
- addRegisterClass(VT, &ARM::QPRRegClass);
+ addRegisterClass(VT, &ARM::MQPRRegClass);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -258,12 +258,31 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::CTLZ, VT, Legal);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::BITREVERSE, VT, Legal);
+ setOperationAction(ISD::BSWAP, VT, Legal);
+ setOperationAction(ISD::SADDSAT, VT, Legal);
+ setOperationAction(ISD::UADDSAT, VT, Legal);
+ setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::USUBSAT, VT, Legal);
// No native support for these.
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+
+ // Vector reductions
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
if (!HasMVEFP) {
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
@@ -271,11 +290,18 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
}
+
+ // Pre and Post inc are supported on loads and stores
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
+ }
}
const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
for (auto VT : FloatTypes) {
- addRegisterClass(VT, &ARM::QPRRegClass);
+ addRegisterClass(VT, &ARM::MQPRRegClass);
if (!HasMVEFP)
setAllExpand(VT);
@@ -287,6 +313,16 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+
+ // Pre and Post inc are supported on loads and stores
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
+ }
if (HasMVEFP) {
setOperationAction(ISD::FMINNUM, VT, Legal);
@@ -314,7 +350,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
// vector types is inhibited at integer-only level.
const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
for (auto VT : LongTypes) {
- addRegisterClass(VT, &ARM::QPRRegClass);
+ addRegisterClass(VT, &ARM::MQPRRegClass);
setAllExpand(VT);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -334,6 +370,33 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+
+ // Pre and Post inc on these are legal, given the correct extends
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, MVT::v8i8, Legal);
+ setIndexedStoreAction(im, MVT::v8i8, Legal);
+ setIndexedLoadAction(im, MVT::v4i8, Legal);
+ setIndexedStoreAction(im, MVT::v4i8, Legal);
+ setIndexedLoadAction(im, MVT::v4i16, Legal);
+ setIndexedStoreAction(im, MVT::v4i16, Legal);
+ }
+
+ // Predicate types
+ const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
+ for (auto VT : pTypes) {
+ addRegisterClass(VT, &ARM::VCCRRegClass);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ }
}
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
@@ -645,8 +708,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
}
- for (MVT VT : MVT::vector_valuetypes()) {
- for (MVT InnerVT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
addAllExtLoads(VT, InnerVT, Expand);
}
@@ -669,8 +732,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addMVEVectorTypes(Subtarget->hasMVEFloatOps());
// Combine low-overhead loop intrinsics so that we can lower i1 types.
- if (Subtarget->hasLOB())
+ if (Subtarget->hasLOB()) {
setTargetDAGCombine(ISD::BRCOND);
+ setTargetDAGCombine(ISD::BR_CC);
+ }
if (Subtarget->hasNEON()) {
addDRTypeForNEON(MVT::v2f32);
@@ -837,10 +902,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SRA);
- setTargetDAGCombine(ISD::SIGN_EXTEND);
- setTargetDAGCombine(ISD::ZERO_EXTEND);
- setTargetDAGCombine(ISD::ANY_EXTEND);
- setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
@@ -849,7 +910,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// It is legal to extload from v4i8 to v4i16 or v4i32.
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
MVT::v2i32}) {
- for (MVT VT : MVT::integer_vector_valuetypes()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
@@ -861,6 +922,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
}
if (!Subtarget->hasFP64()) {
@@ -901,9 +966,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
}
- if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){
+ if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ if (Subtarget->hasFullFP16())
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
}
if (!Subtarget->hasFP16())
@@ -955,6 +1021,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
+ if (Subtarget->hasDSP()) {
+ setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
+ setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
+ }
+ if (Subtarget->hasBaseDSP()) {
+ setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
+ }
// i64 operation support.
setOperationAction(ISD::MUL, MVT::i64, Expand);
@@ -972,6 +1048,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i64, Custom);
setOperationAction(ISD::SRA, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
// MVE lowers 64 bit shifts to lsll and lsrl
@@ -991,7 +1068,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// ARM does not have ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
@@ -1365,14 +1442,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
// On ARM arguments smaller than 4 bytes are extended, so all arguments
// are at least 4 bytes aligned.
- setMinStackArgumentAlignment(4);
+ setMinStackArgumentAlignment(Align(4));
// Prefer likely predicted branches to selects on out-of-order cores.
PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
- setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
+ setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
- setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
+ setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
if (Subtarget->isThumb() || Subtarget->isThumb2())
setTargetDAGCombine(ISD::ABS);
@@ -1472,6 +1549,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::ADDE: return "ARMISD::ADDE";
case ARMISD::SUBC: return "ARMISD::SUBC";
case ARMISD::SUBE: return "ARMISD::SUBE";
+ case ARMISD::LSLS: return "ARMISD::LSLS";
case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
@@ -1496,16 +1574,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
- case ARMISD::VCEQ: return "ARMISD::VCEQ";
- case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
- case ARMISD::VCGE: return "ARMISD::VCGE";
- case ARMISD::VCGEZ: return "ARMISD::VCGEZ";
- case ARMISD::VCLEZ: return "ARMISD::VCLEZ";
- case ARMISD::VCGEU: return "ARMISD::VCGEU";
- case ARMISD::VCGT: return "ARMISD::VCGT";
- case ARMISD::VCGTZ: return "ARMISD::VCGTZ";
- case ARMISD::VCLTZ: return "ARMISD::VCLTZ";
- case ARMISD::VCGTU: return "ARMISD::VCGTU";
+ case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
+ case ARMISD::VCMP: return "ARMISD::VCMP";
+ case ARMISD::VCMPZ: return "ARMISD::VCMPZ";
case ARMISD::VTST: return "ARMISD::VTST";
case ARMISD::VSHLs: return "ARMISD::VSHLs";
@@ -1543,6 +1614,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VTRN: return "ARMISD::VTRN";
case ARMISD::VTBL1: return "ARMISD::VTBL1";
case ARMISD::VTBL2: return "ARMISD::VTBL2";
+ case ARMISD::VMOVN: return "ARMISD::VMOVN";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
case ARMISD::UMAAL: return "ARMISD::UMAAL";
@@ -1560,6 +1632,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
case ARMISD::SMMLAR: return "ARMISD::SMMLAR";
case ARMISD::SMMLSR: return "ARMISD::SMMLSR";
+ case ARMISD::QADD16b: return "ARMISD::QADD16b";
+ case ARMISD::QSUB16b: return "ARMISD::QSUB16b";
+ case ARMISD::QADD8b: return "ARMISD::QADD8b";
+ case ARMISD::QSUB8b: return "ARMISD::QSUB8b";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
@@ -1589,6 +1665,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
case ARMISD::WLS: return "ARMISD::WLS";
+ case ARMISD::LE: return "ARMISD::LE";
+ case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC";
+ case ARMISD::CSINV: return "ARMISD::CSINV";
+ case ARMISD::CSNEG: return "ARMISD::CSNEG";
+ case ARMISD::CSINC: return "ARMISD::CSINC";
}
return nullptr;
}
@@ -1597,6 +1678,11 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const {
if (!VT.isVector())
return getPointerTy(DL);
+
+ // MVE has a predicate register.
+ if (Subtarget->hasMVEIntegerOps() &&
+ (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8))
+ return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
}
@@ -1726,34 +1812,22 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
- ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
+ ARMCC::CondCodes &CondCode2) {
CondCode2 = ARMCC::AL;
- InvalidOnQNaN = true;
switch (CC) {
default: llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
- case ISD::SETOEQ:
- CondCode = ARMCC::EQ;
- InvalidOnQNaN = false;
- break;
+ case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
case ISD::SETGT:
case ISD::SETOGT: CondCode = ARMCC::GT; break;
case ISD::SETGE:
case ISD::SETOGE: CondCode = ARMCC::GE; break;
case ISD::SETOLT: CondCode = ARMCC::MI; break;
case ISD::SETOLE: CondCode = ARMCC::LS; break;
- case ISD::SETONE:
- CondCode = ARMCC::MI;
- CondCode2 = ARMCC::GT;
- InvalidOnQNaN = false;
- break;
+ case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
case ISD::SETO: CondCode = ARMCC::VC; break;
case ISD::SETUO: CondCode = ARMCC::VS; break;
- case ISD::SETUEQ:
- CondCode = ARMCC::EQ;
- CondCode2 = ARMCC::VS;
- InvalidOnQNaN = false;
- break;
+ case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
case ISD::SETUGT: CondCode = ARMCC::HI; break;
case ISD::SETUGE: CondCode = ARMCC::PL; break;
case ISD::SETLT:
@@ -1761,10 +1835,7 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
case ISD::SETLE:
case ISD::SETULE: CondCode = ARMCC::LE; break;
case ISD::SETNE:
- case ISD::SETUNE:
- CondCode = ARMCC::NE;
- InvalidOnQNaN = false;
- break;
+ case ISD::SETUNE: CondCode = ARMCC::NE; break;
}
}
@@ -1988,6 +2059,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction::CallSiteInfo CSInfo;
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
@@ -2112,6 +2184,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
"unexpected use of 'returned'");
isThisReturn = true;
}
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.EnableDebugEntryValues)
+ CSInfo.emplace_back(VA.getLocReg(), i);
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else if (isByVal) {
assert(VA.isMemLoc());
@@ -2347,12 +2422,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
if (isTailCall) {
MF.getFrameInfo().setHasTailCall();
- return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
+ SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+ return Ret;
}
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
@@ -2431,7 +2509,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = std::numeric_limits<int>::max();
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!TargetRegisterInfo::isVirtualRegister(VR))
+ if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
@@ -3047,12 +3125,12 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
// Load the current TEB (thread environment block)
SDValue Ops[] = {Chain,
- DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
- DAG.getConstant(15, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(13, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(2, DL, MVT::i32)};
+ DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+ DAG.getTargetConstant(15, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ DAG.getTargetConstant(13, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ DAG.getTargetConstant(2, DL, MVT::i32)};
SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
DAG.getVTList(MVT::i32, MVT::Other), Ops);
@@ -3498,6 +3576,48 @@ SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
Op.getOperand(0));
}
+SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
+ SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
+ unsigned IntNo =
+ cast<ConstantSDNode>(
+ Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
+ ->getZExtValue();
+ switch (IntNo) {
+ default:
+ return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::arm_gnu_eabi_mcount: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc dl(Op);
+ SDValue Chain = Op.getOperand(0);
+ // call "\01__gnu_mcount_nc"
+ const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask =
+ ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
+ assert(Mask && "Missing call preserved mask for calling convention");
+    // Mark LR as an implicit live-in.
+ unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+ SDValue ReturnAddress =
+ DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
+ std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue};
+ SDValue Callee =
+ DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
+ SDValue RegisterMask = DAG.getRegisterMask(Mask);
+ if (Subtarget->isThumb())
+ return SDValue(
+ DAG.getMachineNode(
+ ARM::tBL_PUSHLR, dl, ResultTys,
+ {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
+ DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
+ 0);
+ return SDValue(
+ DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
+ {ReturnAddress, Callee, RegisterMask, Chain}),
+ 0);
+ }
+ }
+}
+
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const {
@@ -3898,6 +4018,12 @@ SDValue ARMTargetLowering::LowerFormalArguments(
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+ // If this value is passed in r0 and has the returned attribute (e.g.
+ // C++ 'structors), record this fact for later use.
+ if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
+ AFI->setPreservesR0();
+ }
}
// If this is an 8 or 16-bit value, it is really passed promoted
@@ -4049,6 +4175,67 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
std::swap(LHS, RHS);
}
+ // Thumb1 has very limited immediate modes, so turning an "and" into a
+ // shift can save multiple instructions.
+ //
+ // If we have (x & C1), and C1 is an appropriate mask, we can transform it
+ // into "((x << n) >> n)". But that isn't necessarily profitable on its
+ // own. If it's the operand to an unsigned comparison with an immediate,
+ // we can eliminate one of the shifts: we transform
+ // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
+ //
+ // We avoid transforming cases which aren't profitable due to encoding
+ // details:
+ //
+ // 1. C2 fits into the immediate field of a cmp, and the transformed version
+ // would not; in that case, we're essentially trading one immediate load for
+ // another.
+ // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
+ // 3. C2 is zero; we have other code for this special case.
+ //
+ // FIXME: Figure out profitability for Thumb2; we usually can't save an
+ // instruction, since the AND is always one instruction anyway, but we could
+ // use narrow instructions in some cases.
+ if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
+ LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
+ !isSignedIntSetCC(CC)) {
+ unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
+ auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
+ uint64_t RHSV = RHSC->getZExtValue();
+ if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
+ unsigned ShiftBits = countLeadingZeros(Mask);
+ if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
+ SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
+ LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
+ RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
+ }
+ }
+ }
+
+ // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
+ // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
+ // way a cmp would.
+ // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
+ // some tweaks to the heuristics for the previous and->shift transform.
+ // FIXME: Optimize cases where the LHS isn't a shift.
+ if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(RHS) &&
+ cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
+ CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
+ unsigned ShiftAmt =
+ cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
+ SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
+ DAG.getVTList(MVT::i32, MVT::i32),
+ LHS.getOperand(0),
+ DAG.getConstant(ShiftAmt, dl, MVT::i32));
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
+ Shift.getValue(1), SDValue());
+ ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
+ return Chain.getValue(1);
+ }
+
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
// If the RHS is a constant zero then the V (overflow) flag will never be
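A quick standalone check of the "(x << c) > 0x80000000U" to "lsls x, c+1" rewrite added above (my own illustration, not part of the patch): the carry-out of the extra shift is bit 31 of x << c, and Z covers the remaining bits, so ARMCC::HI (C set, Z clear) reproduces the unsigned comparison.

#include <cassert>
#include <cstdint>

// HI after "lsls x, c+1": carry set (bit 31 of x << c) and result non-zero.
static bool hiAfterLsls(uint32_t X, unsigned C) {
  uint32_t Shifted = X << C;                             // value being compared
  bool Carry = (Shifted >> 31) & 1;                      // carry-out of one more shift
  bool Zero = static_cast<uint32_t>(Shifted << 1) == 0;  // Z flag of the lsls
  return Carry && !Zero;
}

int main() {
  for (unsigned C = 0; C < 31; ++C)
    for (uint32_t Low = 0; Low <= 0xFF; ++Low)
      for (uint32_t High = 0; High <= 0xFF; ++High) {
        uint32_t X = (High << 24) | Low;                 // mix high and low bits
        assert(hiAfterLsls(X, C) == ((X << C) > 0x80000000U));
      }
}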
@@ -4083,15 +4270,13 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, const SDLoc &dl,
- bool InvalidOnQNaN) const {
+ SelectionDAG &DAG, const SDLoc &dl) const {
assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
- SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
if (!isFloatingPointZero(RHS))
- Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
+ Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
else
- Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
+ Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}
@@ -4108,12 +4293,10 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
Cmp = Cmp.getOperand(0);
Opc = Cmp.getOpcode();
if (Opc == ARMISD::CMPFP)
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
- Cmp.getOperand(1), Cmp.getOperand(2));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
else {
assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
- Cmp.getOperand(1));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
}
return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}
@@ -4276,6 +4459,35 @@ SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
+static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ EVT VT = Op.getValueType();
+ if (!Subtarget->hasDSP())
+ return SDValue();
+ if (!VT.isSimple())
+ return SDValue();
+
+ unsigned NewOpcode;
+ bool IsAdd = Op->getOpcode() == ISD::SADDSAT;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::i8:
+ NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b;
+ break;
+ case MVT::i16:
+ NewOpcode = IsAdd ? ARMISD::QADD16b : ARMISD::QSUB16b;
+ break;
+ }
+
+ SDLoc dl(Op);
+ SDValue Add =
+ DAG.getNode(NewOpcode, dl, MVT::i32,
+ DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
+ DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
+}
+
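For reference, the semantics LowerSADDSUBSAT has to preserve for the i16 case are just widen, clamp, truncate; a minimal sketch of ISD::SADDSAT on i16 (my own illustration, assuming the QADD16b node saturates to the signed 16-bit range as the DSP instructions do):

#include <algorithm>
#include <cstdint>

// Reference semantics of a signed saturating i16 add: sign-extend to i32,
// add, clamp to [INT16_MIN, INT16_MAX], then truncate back.
static int16_t saddsat16(int16_t A, int16_t B) {
  int32_t R = static_cast<int32_t>(A) + static_cast<int32_t>(B);
  R = std::clamp(R, static_cast<int32_t>(INT16_MIN),
                 static_cast<int32_t>(INT16_MAX));
  return static_cast<int16_t>(R);
}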
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue SelectTrue = Op.getOperand(1);
@@ -4656,10 +4868,62 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
+
+ if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
+ LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
+ unsigned TVal = CTVal->getZExtValue();
+ unsigned FVal = CFVal->getZExtValue();
+ unsigned Opcode = 0;
+
+ if (TVal == ~FVal) {
+ Opcode = ARMISD::CSINV;
+ } else if (TVal == ~FVal + 1) {
+ Opcode = ARMISD::CSNEG;
+ } else if (TVal + 1 == FVal) {
+ Opcode = ARMISD::CSINC;
+ } else if (TVal == FVal + 1) {
+ Opcode = ARMISD::CSINC;
+ std::swap(TrueVal, FalseVal);
+ std::swap(TVal, FVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+ if (Opcode) {
+ // If one of the constants is cheaper than another, materialise the
+ // cheaper one and let the csel generate the other.
+ if (Opcode != ARMISD::CSINC &&
+ HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
+ std::swap(TrueVal, FalseVal);
+ std::swap(TVal, FVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+
+      // Attempt to use ZR, checking whether TVal is 0 and possibly inverting
+      // the condition to get there. CSINC is not invertible like the other
+      // two (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
+ if (FVal == 0 && Opcode != ARMISD::CSINC) {
+ std::swap(TrueVal, FalseVal);
+ std::swap(TVal, FVal);
+ CC = ISD::getSetCCInverse(CC, true);
+ }
+ if (TVal == 0)
+ TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);
+
+ // Drops F's value because we can get it by inverting/negating TVal.
+ FalseVal = TrueVal;
+
+ SDValue ARMcc;
+ SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+ EVT VT = TrueVal.getValueType();
+ return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
+ }
+ }
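The new v8.1-M path above only needs to materialise one of the two constants; which conditional-select form can recover the other one follows the identities checked in the code. A small illustrative classification (plain C++, not LLVM API):

#include <cstdint>

enum class CSForm { None, CSInv, CSNeg, CSInc, CSIncSwapped };

// Mirrors the checks above: how FVal can be rebuilt from TVal.
static CSForm classify(uint32_t TVal, uint32_t FVal) {
  if (TVal == ~FVal)      return CSForm::CSInv;        // FVal == ~TVal
  if (TVal == ~FVal + 1)  return CSForm::CSNeg;        // FVal == -TVal
  if (TVal + 1 == FVal)   return CSForm::CSInc;        // FVal == TVal + 1
  if (TVal == FVal + 1)   return CSForm::CSIncSwapped; // swap operands, invert cond
  return CSForm::None;
}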
if (isUnsupportedFloatingType(LHS.getValueType())) {
DAG.getTargetLoweringInfo().softenSetCCOperands(
- DAG, LHS.getValueType(), LHS, RHS, CC, dl);
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands only returned one value, we should compare it to
// zero.
@@ -4701,8 +4965,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
ARMCC::CondCodes CondCode, CondCode2;
- bool InvalidOnQNaN;
- FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
+ FPCCToARMCC(CC, CondCode, CondCode2);
// Normalize the fp compare. If RHS is zero we prefer to keep it there so we
// match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
@@ -4727,13 +4990,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
- SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
if (CondCode2 != ARMCC::AL) {
SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
// FIXME: Needs another CMP because flag can have but one use.
- SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
+ SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
}
return Result;
@@ -4903,7 +5166,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
if (isUnsupportedFloatingType(LHS.getValueType())) {
DAG.getTargetLoweringInfo().softenSetCCOperands(
- DAG, LHS.getValueType(), LHS, RHS, CC, dl);
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands only returned one value, we should compare it to
// zero.
@@ -4960,11 +5223,10 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
ARMCC::CondCodes CondCode, CondCode2;
- bool InvalidOnQNaN;
- FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
+ FPCCToARMCC(CC, CondCode, CondCode2);
SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
- SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
@@ -5056,8 +5318,9 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
else
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
Op.getValueType());
+ MakeLibCallOptions CallOptions;
return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
- /*isSigned*/ false, SDLoc(Op)).first;
+ CallOptions, SDLoc(Op)).first;
}
return Op;
@@ -5120,8 +5383,9 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
else
LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
Op.getValueType());
+ MakeLibCallOptions CallOptions;
return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
- /*isSigned*/ false, SDLoc(Op)).first;
+ CallOptions, SDLoc(Op)).first;
}
return Op;
@@ -5140,7 +5404,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
if (UseNEON) {
// Use VBSL to copy the sign bit.
- unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
+ unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
@@ -5163,7 +5427,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
- SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
+ SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
dl, MVT::i32);
AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
@@ -5243,7 +5507,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- unsigned FrameReg = ARI.getFrameRegister(MF);
+ Register FrameReg = ARI.getFrameRegister(MF);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
@@ -5253,9 +5517,9 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
- unsigned Reg = StringSwitch<unsigned>(RegName)
+Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const {
+ Register Reg = StringSwitch<unsigned>(RegName)
.Case("sp", ARM::SP)
.Default(0);
if (Reg)
@@ -5576,8 +5840,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
- if (VT.isVector()) {
- assert(ST->hasNEON());
+ if (VT.isVector() && ST->hasNEON()) {
// Compute the least significant set bit: LSB = X & -X
SDValue X = N->getOperand(0);
@@ -5777,14 +6040,15 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
unsigned ShPartsOpc = ARMISD::LSLL;
ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
- // If the shift amount is greater than 32 then do the default optimisation
- if (Con && Con->getZExtValue() > 32)
+  // If the shift amount type is wider than 64 bits, or a constant shift amount
+  // is zero or at least 32, then do the default optimisation
+ if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
+ (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
return SDValue();
- // Extract the lower 32 bits of the shift amount if it's an i64
- if (ShAmt->getValueType(0) == MVT::i64)
- ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt,
- DAG.getConstant(0, dl, MVT::i32));
+ // Extract the lower 32 bits of the shift amount if it's not an i32
+ if (ShAmt->getValueType(0) != MVT::i32)
+ ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
if (ShOpc == ISD::SRL) {
if (!Con)
@@ -5839,20 +6103,37 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}
-static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
- SDValue TmpOp0, TmpOp1;
+static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
bool Invert = false;
bool Swap = false;
- unsigned Opc = 0;
+ unsigned Opc = ARMCC::AL;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
- EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
EVT VT = Op.getValueType();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
SDLoc dl(Op);
+ EVT CmpVT;
+ if (ST->hasNEON())
+ CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
+ else {
+ assert(ST->hasMVEIntegerOps() &&
+ "No hardware support for integer vector comparison!");
+
+ if (Op.getValueType().getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ // Make sure we expand floating point setcc to scalar if we do not have
+ // mve.fp, so that we can handle them from there.
+ if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
+ return SDValue();
+
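+    // For MVE the compare produces a predicate (i1) vector directly, so
+    // compare in the result type rather than a widened integer type.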
+ CmpVT = VT;
+ }
+
if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
(SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
// Special-case integer 64-bit equality comparisons. They aren't legal,
@@ -5880,60 +6161,74 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
switch (SetCCOpcode) {
default: llvm_unreachable("Illegal FP comparison");
case ISD::SETUNE:
- case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETNE:
+ if (ST->hasMVEFloatOps()) {
+ Opc = ARMCC::NE; break;
+ } else {
+ Invert = true; LLVM_FALLTHROUGH;
+ }
case ISD::SETOEQ:
- case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
+ case ISD::SETEQ: Opc = ARMCC::EQ; break;
case ISD::SETOLT:
case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETOGT:
- case ISD::SETGT: Opc = ARMISD::VCGT; break;
+ case ISD::SETGT: Opc = ARMCC::GT; break;
case ISD::SETOLE:
case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETOGE:
- case ISD::SETGE: Opc = ARMISD::VCGE; break;
+ case ISD::SETGE: Opc = ARMCC::GE; break;
case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
+ case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
+ case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
- case ISD::SETONE:
+ case ISD::SETONE: {
// Expand this to (OLT | OGT).
- TmpOp0 = Op0;
- TmpOp1 = Op1;
- Opc = ISD::OR;
- Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
- Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
- break;
- case ISD::SETUO:
- Invert = true;
- LLVM_FALLTHROUGH;
- case ISD::SETO:
+ SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
+ DAG.getConstant(ARMCC::GT, dl, MVT::i32));
+ SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
+ DAG.getConstant(ARMCC::GT, dl, MVT::i32));
+ SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+ return Result;
+ }
+ case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETO: {
// Expand this to (OLT | OGE).
- TmpOp0 = Op0;
- TmpOp1 = Op1;
- Opc = ISD::OR;
- Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
- Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
- break;
+ SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
+ DAG.getConstant(ARMCC::GT, dl, MVT::i32));
+ SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
+ DAG.getConstant(ARMCC::GE, dl, MVT::i32));
+ SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+ return Result;
+ }
}
} else {
// Integer comparisons.
switch (SetCCOpcode) {
default: llvm_unreachable("Illegal integer comparison");
- case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
- case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
+ case ISD::SETNE:
+ if (ST->hasMVEIntegerOps()) {
+ Opc = ARMCC::NE; break;
+ } else {
+ Invert = true; LLVM_FALLTHROUGH;
+ }
+ case ISD::SETEQ: Opc = ARMCC::EQ; break;
case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETGT: Opc = ARMISD::VCGT; break;
+ case ISD::SETGT: Opc = ARMCC::GT; break;
case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETGE: Opc = ARMISD::VCGE; break;
+ case ISD::SETGE: Opc = ARMCC::GE; break;
case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
+ case ISD::SETUGT: Opc = ARMCC::HI; break;
case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
+ case ISD::SETUGE: Opc = ARMCC::HS; break;
}
// Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
- if (Opc == ARMISD::VCEQ) {
+ if (ST->hasNEON() && Opc == ARMCC::EQ) {
SDValue AndOp;
if (ISD::isBuildVectorAllZeros(Op1.getNode()))
AndOp = Op0;
@@ -5945,10 +6240,12 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
AndOp = AndOp.getOperand(0);
if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
- Opc = ARMISD::VTST;
Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
- Invert = !Invert;
+ SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
+ if (!Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+ return Result;
}
}
}
@@ -5962,31 +6259,20 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
if (ISD::isBuildVectorAllZeros(Op1.getNode()))
SingleOp = Op0;
else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
- if (Opc == ARMISD::VCGE)
- Opc = ARMISD::VCLEZ;
- else if (Opc == ARMISD::VCGT)
- Opc = ARMISD::VCLTZ;
+ if (Opc == ARMCC::GE)
+ Opc = ARMCC::LE;
+ else if (Opc == ARMCC::GT)
+ Opc = ARMCC::LT;
SingleOp = Op1;
}
SDValue Result;
if (SingleOp.getNode()) {
- switch (Opc) {
- case ARMISD::VCEQ:
- Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
- case ARMISD::VCGE:
- Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
- case ARMISD::VCLEZ:
- Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
- case ARMISD::VCGT:
- Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
- case ARMISD::VCLTZ:
- Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
- default:
- Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
- }
+ Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
+ DAG.getConstant(Opc, dl, MVT::i32));
} else {
- Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
+ Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
+ DAG.getConstant(Opc, dl, MVT::i32));
}
Result = DAG.getSExtOrTrunc(Result, dl, VT);
@@ -6027,13 +6313,13 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
CCR, Chain.getValue(1));
}
-/// isNEONModifiedImm - Check if the specified splat value corresponds to a
-/// valid vector constant for a NEON or MVE instruction with a "modified immediate"
-/// operand (e.g., VMOV). If so, return the encoded value.
-static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
+/// valid vector constant for a NEON or MVE instruction with a "modified
+/// immediate" operand (e.g., VMOV). If so, return the encoded value.
+static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
unsigned SplatBitSize, SelectionDAG &DAG,
const SDLoc &dl, EVT &VT, bool is128Bits,
- NEONModImmType type) {
+ VMOVModImmType type) {
unsigned OpCmode, Imm;
// SplatBitSize is set to the smallest size that splats the vector, so a
@@ -6163,10 +6449,10 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
}
default:
- llvm_unreachable("unexpected size for isNEONModifiedImm");
+ llvm_unreachable("unexpected size for isVMOVModifiedImm");
}
- unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
+ unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}
@@ -6246,7 +6532,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
return SDValue();
// Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
- SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
+ SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
VMovVT, false, VMOVModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
@@ -6263,7 +6549,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
}
// Finally, try a VMVN.i32
- NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
+ NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
false, VMVNModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
@@ -6649,6 +6935,29 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) {
return true;
}
+static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
+ unsigned NumElts = VT.getVectorNumElements();
+ // Make sure the mask has the right size.
+ if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
+ return false;
+
+ // If Top
+ // Look for <0, N, 2, N+2, 4, N+4, ..>.
+ // This inserts Input2 into Input1
+ // else if not Top
+ // Look for <0, N+1, 2, N+3, 4, N+5, ..>
+ // This inserts Input1 into Input2
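+  // For example, for v8i16 (N = 8) the Top mask is <0, 8, 2, 10, 4, 12, 6, 14>
+  // and the non-Top mask is <0, 9, 2, 11, 4, 13, 6, 15>.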
+ unsigned Offset = Top ? 0 : 1;
+ for (unsigned i = 0; i < NumElts; i+=2) {
+ if (M[i] >= 0 && M[i] != (int)i)
+ return false;
+ if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset))
+ return false;
+ }
+
+ return true;
+}
+
// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
@@ -6669,6 +6978,66 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned BoolMask;
+ unsigned BitsPerBool;
+ if (NumElts == 4) {
+ BitsPerBool = 4;
+ BoolMask = 0xf;
+ } else if (NumElts == 8) {
+ BitsPerBool = 2;
+ BoolMask = 0x3;
+ } else if (NumElts == 16) {
+ BitsPerBool = 1;
+ BoolMask = 0x1;
+ } else
+ return SDValue();
+
+ // If this is a single value copied into all lanes (a splat), we can just sign
+ // extend that single value
+ SDValue FirstOp = Op.getOperand(0);
+ if (!isa<ConstantSDNode>(FirstOp) &&
+ std::all_of(std::next(Op->op_begin()), Op->op_end(),
+ [&FirstOp](SDUse &U) {
+ return U.get().isUndef() || U.get() == FirstOp;
+ })) {
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
+ DAG.getValueType(MVT::i1));
+ return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
+ }
+
+ // First create base with bits set where known
+ unsigned Bits32 = 0;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (!isa<ConstantSDNode>(V) && !V.isUndef())
+ continue;
+ bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
+ if (BitSet)
+ Bits32 |= BoolMask << (i * BitsPerBool);
+ }
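+  // For example, a constant v4i1 <1, 0, 1, 1> gives Bits32 == 0xff0f, with
+  // each bool replicated across its 4-bit field of the 16-bit predicate.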
+
+ // Add in unknown nodes
+ SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
+ DAG.getConstant(Bits32, dl, MVT::i32));
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (isa<ConstantSDNode>(V) || V.isUndef())
+ continue;
+ Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
+ DAG.getConstant(i, dl, MVT::i32));
+ }
+
+ return Base;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
@@ -6677,6 +7046,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
EVT VT = Op.getValueType();
+ if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
+ return LowerBUILD_VECTOR_i1(Op, DAG, ST);
+
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
@@ -6688,7 +7060,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
(ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
// Check if an immediate VMOV works.
EVT VmovVT;
- SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+ SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VmovVT, VT.is128BitVector(),
VMOVModImm);
@@ -6700,7 +7072,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// Try an immediate VMVN.
uint64_t NegatedImm = (~SplatBits).getZExtValue();
- Val = isNEONModifiedImm(
+ Val = isVMOVModifiedImm(
NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VmovVT, VT.is128BitVector(),
ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
@@ -7088,9 +7460,6 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
LaneMask[j] = ExtractBase + j;
}
- // Final check before we try to produce nonsense...
- if (!isShuffleMaskLegal(Mask, ShuffleVT))
- return SDValue();
// We can't handle more than two sources. This should have already
// been checked before this point.
@@ -7100,8 +7469,10 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
- SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
- ShuffleOps[1], Mask);
+ SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+ ShuffleOps[1], Mask, DAG);
+ if (!Shuffle)
+ return SDValue();
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
@@ -7168,6 +7539,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
unsigned EltSize = VT.getScalarSizeInBits();
if (EltSize >= 32 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ ShuffleVectorInst::isIdentityMask(M) ||
isVREVMask(M, VT, 64) ||
isVREVMask(M, VT, 32) ||
isVREVMask(M, VT, 16))
@@ -7180,6 +7552,9 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
isReverseMask(M, VT))
return true;
+ else if (Subtarget->hasMVEIntegerOps() &&
+ (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1)))
+ return true;
else
return false;
}
@@ -7282,6 +7657,94 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
DAG.getConstant(ExtractNum, DL, MVT::i32));
}
+static EVT getVectorTyFromPredicateVector(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::v4i1:
+ return MVT::v4i32;
+ case MVT::v8i1:
+ return MVT::v8i16;
+ case MVT::v16i1:
+ return MVT::v16i8;
+ default:
+ llvm_unreachable("Unexpected vector predicate type");
+ }
+}
+
+static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
+ SelectionDAG &DAG) {
+ // Converting from boolean predicates to integers involves creating a vector
+ // of all ones or all zeroes and selecting the lanes based upon the real
+ // predicate.
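+  // For example, a v4i1 predicate becomes a v4i32 vector whose lanes are
+  // either all-ones (0xffffffff) or zero.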
+ SDValue AllOnes =
+ DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
+ AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
+
+ SDValue AllZeroes =
+ DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
+ AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
+
+ // Get full vector type from predicate type
+ EVT NewVT = getVectorTyFromPredicateVector(VT);
+
+ SDValue RecastV1;
+  // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
+  // this to a v16i1. This cannot be done with an ordinary bitcast because the
+  // sizes are not the same. We have to use an MVE-specific PREDICATE_CAST node,
+  // since we know that in hardware the sizes really are the same.
+ if (VT != MVT::v16i1)
+ RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
+ else
+ RecastV1 = Pred;
+
+ // Select either all ones or zeroes depending upon the real predicate bits.
+ SDValue PredAsVector =
+ DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
+
+ // Recast our new predicate-as-integer v16i8 vector into something
+ // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
+ return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
+}
+
+static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VT = Op.getValueType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+
+ assert(ST->hasMVEIntegerOps() &&
+ "No support for vector shuffle of boolean predicates");
+
+ SDValue V1 = Op.getOperand(0);
+ SDLoc dl(Op);
+ if (isReverseMask(ShuffleMask, VT)) {
+ SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
+ SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
+ SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
+ DAG.getConstant(16, dl, MVT::i32));
+ return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
+ }
+
+ // Until we can come up with optimised cases for every single vector
+ // shuffle in existence we have chosen the least painful strategy. This is
+  // to essentially promote the boolean predicate to an 8-bit integer, where
+  // each predicate lane becomes a byte. Then we fall back on a normal integer
+ // vector shuffle and convert the result back into a predicate vector. In
+ // many cases the generated code might be even better than scalar code
+ // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
+ // fields in a register into 8 other arbitrary 2-bit fields!
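+  // For example, a v8i1 shuffle is performed as a v8i16 shuffle of 0/-1 lanes,
+  // and a VCMPZ against zero (NE) converts the result back into a v8i1.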
+ SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
+ EVT NewVT = PredAsVector.getValueType();
+
+ // Do the shuffle!
+ SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
+ DAG.getUNDEF(NewVT), ShuffleMask);
+
+ // Now return the result of comparing the shuffled vector with zero,
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32));
+}
+
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
@@ -7289,6 +7752,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ unsigned EltSize = VT.getScalarSizeInBits();
+
+ if (ST->hasMVEIntegerOps() && EltSize == 1)
+ return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
@@ -7298,7 +7765,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// of the same time so that they get CSEd properly.
ArrayRef<int> ShuffleMask = SVN->getMask();
- unsigned EltSize = VT.getScalarSizeInBits();
if (EltSize <= 32) {
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
@@ -7364,6 +7830,14 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
.getValue(WhichResult);
}
}
+ if (ST->hasMVEIntegerOps()) {
+ if (isVMOVNMask(ShuffleMask, VT, 0))
+ return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
+ DAG.getConstant(0, dl, MVT::i32));
+ if (isVMOVNMask(ShuffleMask, VT, 1))
+ return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
+ DAG.getConstant(1, dl, MVT::i32));
+ }
// Also check for these shuffles through CONCAT_VECTORS: we canonicalize
// shuffles that produce a result larger than their operands with:
@@ -7468,8 +7942,29 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-SDValue ARMTargetLowering::
-LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VecVT = Op.getOperand(0).getValueType();
+ SDLoc dl(Op);
+
+ assert(ST->hasMVEIntegerOps() &&
+ "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
+
+ SDValue Conv =
+ DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
+ unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned LaneWidth =
+ getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
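+  // Each boolean lane occupies LaneWidth bits of the 16-bit predicate (4 for
+  // v4i1, 2 for v8i1, 1 for v16i1); sign-extend the new element and BFI it
+  // into that lane's bit range.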
+ unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
+ Op.getOperand(1), DAG.getValueType(MVT::i1));
+ SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
+ DAG.getConstant(~Mask, dl, MVT::i32));
+ return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
+}
+
+SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
// INSERT_VECTOR_ELT is legal only for immediate indexes.
SDValue Lane = Op.getOperand(2);
if (!isa<ConstantSDNode>(Lane))
@@ -7477,6 +7972,11 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
SDValue Elt = Op.getOperand(1);
EVT EltVT = Elt.getValueType();
+
+ if (Subtarget->hasMVEIntegerOps() &&
+ Op.getValueType().getScalarSizeInBits() == 1)
+ return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
+
if (getTypeAction(*DAG.getContext(), EltVT) ==
TargetLowering::TypePromoteFloat) {
// INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
@@ -7505,13 +8005,37 @@ LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
-static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VecVT = Op.getOperand(0).getValueType();
+ SDLoc dl(Op);
+
+ assert(ST->hasMVEIntegerOps() &&
+ "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
+
+ SDValue Conv =
+ DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
+ unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned LaneWidth =
+ getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
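+  // Shift the selected lane down so that its predicate bits start at bit 0 of
+  // the i32 result.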
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
+ DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
+ return Shift;
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
// EXTRACT_VECTOR_ELT is legal only for immediate indexes.
SDValue Lane = Op.getOperand(1);
if (!isa<ConstantSDNode>(Lane))
return SDValue();
SDValue Vec = Op.getOperand(0);
+ EVT VT = Vec.getValueType();
+
+ if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
+ return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
+
if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
SDLoc dl(Op);
return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
@@ -7520,7 +8044,64 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
return Op;
}
-static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT Op1VT = V1.getValueType();
+ EVT Op2VT = V2.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert(Op1VT == Op2VT && "Operand types don't match!");
+ assert(VT.getScalarSizeInBits() == 1 &&
+ "Unexpected custom CONCAT_VECTORS lowering");
+ assert(ST->hasMVEIntegerOps() &&
+ "CONCAT_VECTORS lowering only supported for MVE");
+
+ SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
+ SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
+
+ // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
+ // promoted to v8i16, etc.
+
+ MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
+
+ // Extract the vector elements from Op1 and Op2 one by one and truncate them
+ // to be the right size for the destination. For example, if Op1 is v4i1 then
+  // the promoted vector is v4i32. The result of concatenation gives a v8i1,
+ // which when promoted is v8i16. That means each i32 element from Op1 needs
+ // truncating to i16 and inserting in the result.
+ EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
+ SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
+  auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
+ EVT NewVT = NewV.getValueType();
+ EVT ConcatVT = ConVec.getValueType();
+ for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
+ DAG.getIntPtrConstant(i, dl));
+ ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
+ DAG.getConstant(j, dl, MVT::i32));
+ }
+ return ConVec;
+ };
+ unsigned j = 0;
+  ConVec = ExtractInto(NewV1, ConVec, j);
+  ConVec = ExtractInto(NewV2, ConVec, j);
+
+ // Now return the result of comparing the subvector with zero,
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32));
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VT = Op->getValueType(0);
+ if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
+ return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
+
// The only time a CONCAT_VECTORS operation can have legal types is when
// two 64-bit vectors are concatenated to a 128-bit vector.
assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
@@ -7540,6 +8121,43 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
}
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT Op1VT = V1.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();
+
+ assert(VT.getScalarSizeInBits() == 1 &&
+ "Unexpected custom EXTRACT_SUBVECTOR lowering");
+ assert(ST->hasMVEIntegerOps() &&
+ "EXTRACT_SUBVECTOR lowering only supported for MVE");
+
+ SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
+
+ // We now have Op1 promoted to a vector of integers, where v8i1 gets
+ // promoted to v8i16, etc.
+
+ MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
+
+ EVT SubVT = MVT::getVectorVT(ElType, NumElts);
+ SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
+ for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
+ DAG.getIntPtrConstant(i, dl));
+ SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
+ DAG.getConstant(j, dl, MVT::i32));
+ }
+
+ // Now return the result of comparing the subvector with zero,
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32));
+}
+
/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
@@ -7897,7 +8515,8 @@ static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
return N0;
}
-static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
EVT VT = Op.getValueType();
assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
"unexpected type for custom-lowering ISD::SDIV");
@@ -7924,7 +8543,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
- N0 = LowerCONCAT_VECTORS(N0, DAG);
+ N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
return N0;
@@ -7932,7 +8551,8 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
return LowerSDIV_v4i16(N0, N1, dl, DAG);
}
-static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
// TODO: Should this propagate fast-math-flags?
EVT VT = Op.getValueType();
assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
@@ -7960,7 +8580,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
- N0 = LowerCONCAT_VECTORS(N0, DAG);
+ N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
@@ -8255,6 +8875,96 @@ void ARMTargetLowering::ExpandDIV_Windows(
Results.push_back(Upper);
}
+static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+ EVT MemVT = LD->getMemoryVT();
+ assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
+ "Expected a predicate type!");
+ assert(MemVT == Op.getValueType());
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Expected a non-extending load");
+  assert(LD->isUnindexed() && "Expected an unindexed load");
+
+  // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16-bit
+  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
+  // need to make sure that the 8/4 bits are actually loaded into the correct
+  // place, which means loading the value and then shuffling the values into
+  // the bottom bits of the predicate.
+  // Equally, VLDR for a v16i1 will actually load 32 bits (so will be incorrect
+  // for BE).
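+  // Load the predicate as a small integer, extend it to an i32, cast that to a
+  // v16i1 with PREDICATE_CAST, and extract the low lanes as the result type.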
+
+ SDLoc dl(Op);
+ SDValue Load = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
+ LD->getMemOperand());
+ SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
+ if (MemVT != MVT::v16i1)
+ Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
+}
+
+static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+ EVT MemVT = ST->getMemoryVT();
+ assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
+ "Expected a predicate type!");
+ assert(MemVT == ST->getValue().getValueType());
+ assert(!ST->isTruncatingStore() && "Expected a non-extending store");
+  assert(ST->isUnindexed() && "Expected an unindexed store");
+
+ // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
+ // unset and a scalar store.
+ SDLoc dl(Op);
+ SDValue Build = ST->getValue();
+ if (MemVT != MVT::v16i1) {
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
+ DAG.getConstant(I, dl, MVT::i32)));
+ for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
+ Ops.push_back(DAG.getUNDEF(MVT::i32));
+ Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
+ }
+ SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
+ return DAG.getTruncStore(
+ ST->getChain(), dl, GRP, ST->getBasePtr(),
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
+ ST->getMemOperand());
+}
+
+static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
+ MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+ MVT VT = Op.getSimpleValueType();
+ SDValue Mask = N->getMask();
+ SDValue PassThru = N->getPassThru();
+ SDLoc dl(Op);
+
+ auto IsZero = [](SDValue PassThru) {
+ return (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
+ (PassThru->getOpcode() == ARMISD::VMOVIMM &&
+ isNullConstant(PassThru->getOperand(0))));
+ };
+
+ if (IsZero(PassThru))
+ return Op;
+
+ // MVE Masked loads use zero as the passthru value. Here we convert undef to
+ // zero too, and other values are lowered to a select.
+ SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+ DAG.getTargetConstant(0, dl, MVT::i32));
+ SDValue NewLoad = DAG.getMaskedLoad(
+ VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
+ N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
+ SDValue Combo = NewLoad;
+ if (!PassThru.isUndef() &&
+ (PassThru.getOpcode() != ISD::BITCAST ||
+ !IsZero(PassThru->getOperand(0))))
+ Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
+ return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
+}
+
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
// Acquire/Release load/store is not legal for targets without a dmb or
@@ -8273,12 +8983,12 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
// Under Power Management extensions, the cycle-count is:
// mrc p15, #0, <Rt>, c9, c13, #0
SDValue Ops[] = { N->getOperand(0), // Chain
- DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
- DAG.getConstant(15, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(9, DL, MVT::i32),
- DAG.getConstant(13, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32)
+ DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+ DAG.getTargetConstant(15, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ DAG.getTargetConstant(9, DL, MVT::i32),
+ DAG.getTargetConstant(13, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32)
};
SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
@@ -8412,6 +9122,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
+ case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
Subtarget);
case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
@@ -8426,24 +9137,25 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
- case ISD::SETCC: return LowerVSETCC(Op, DAG);
+ case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
+ case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
- case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
- case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SDIV:
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ true);
- return LowerSDIV(Op, DAG);
+ return LowerSDIV(Op, DAG, Subtarget);
case ISD::UDIV:
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ false);
- return LowerUDIV(Op, DAG);
+ return LowerUDIV(Op, DAG, Subtarget);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::SADDO:
@@ -8452,6 +9164,15 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UADDO:
case ISD::USUBO:
return LowerUnsignedALUO(Op, DAG);
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ return LowerSADDSUBSAT(Op, DAG, Subtarget);
+ case ISD::LOAD:
+ return LowerPredicateLoad(Op, DAG);
+ case ISD::STORE:
+ return LowerPredicateStore(Op, DAG);
+ case ISD::MLOAD:
+ return LowerMLOAD(Op, DAG);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
@@ -8530,6 +9251,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res.getValue(0));
Results.push_back(Res.getValue(1));
return;
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
+ break;
case ISD::READCYCLECOUNTER:
ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
return;
@@ -8600,19 +9325,19 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// orr r5, r5, #1
// add r5, pc
// str r5, [$jbuf, #+4] ; &jbuf[1]
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
.addConstantPoolIndex(CPI)
.addMemOperand(CPMMO)
.add(predOps(ARMCC::AL));
// Set the low bit because of thumb mode.
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ Register NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(0x01)
.add(predOps(ARMCC::AL))
.add(condCodeOp());
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
.addReg(NewVReg2, RegState::Kill)
.addImm(PCLabelId);
@@ -8630,28 +9355,28 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// orrs r1, r2
// add r2, $jbuf, #+4 ; &jbuf[1]
// str r1, [r2]
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
.addConstantPoolIndex(CPI)
.addMemOperand(CPMMO)
.add(predOps(ARMCC::AL));
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ Register NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(PCLabelId);
// Set the low bit because of thumb mode.
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
.addReg(ARM::CPSR, RegState::Define)
.addImm(1)
.add(predOps(ARMCC::AL));
- unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ Register NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg2, RegState::Kill)
.addReg(NewVReg3, RegState::Kill)
.add(predOps(ARMCC::AL));
- unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+ Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
.addFrameIndex(FI)
.addImm(36); // &jbuf[1] :: pc
@@ -8666,13 +9391,13 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// ldr r1, LCPI1_1
// add r1, pc, r1
// str r1, [$jbuf, #+4] ; &jbuf[1]
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
.addConstantPoolIndex(CPI)
.addImm(0)
.addMemOperand(CPMMO)
.add(predOps(ARMCC::AL));
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ Register NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(PCLabelId)
@@ -8794,7 +9519,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
bool IsPositionIndependent = isPositionIndependent();
unsigned NumLPads = LPadList.size();
if (Subtarget->isThumb2()) {
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
.addFrameIndex(FI)
.addImm(4)
@@ -8807,7 +9532,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(LPadList.size())
.add(predOps(ARMCC::AL));
} else {
- unsigned VReg1 = MRI->createVirtualRegister(TRC);
+ Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
.addImm(NumLPads & 0xFFFF)
.add(predOps(ARMCC::AL));
@@ -8832,12 +9557,12 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(ARMCC::HI)
.addReg(ARM::CPSR);
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
- unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ Register NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
.addReg(NewVReg3, RegState::Kill)
.addReg(NewVReg1)
@@ -8850,7 +9575,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(NewVReg1)
.addJumpTableIndex(MJTI);
} else if (Subtarget->isThumb()) {
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
.addFrameIndex(FI)
.addImm(1)
@@ -8873,7 +9598,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
- unsigned VReg1 = MRI->createVirtualRegister(TRC);
+ Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
.addReg(VReg1, RegState::Define)
.addConstantPoolIndex(Idx)
@@ -8889,19 +9614,19 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(ARMCC::HI)
.addReg(ARM::CPSR);
- unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ Register NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg1)
.addImm(2)
.add(predOps(ARMCC::AL));
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
- unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ Register NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg2, RegState::Kill)
@@ -8911,7 +9636,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
- unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+ Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
.addReg(NewVReg4, RegState::Kill)
.addImm(0)
@@ -8932,7 +9657,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(NewVReg6, RegState::Kill)
.addJumpTableIndex(MJTI);
} else {
- unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ Register NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
.addFrameIndex(FI)
.addImm(4)
@@ -8945,7 +9670,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(NumLPads)
.add(predOps(ARMCC::AL));
} else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
- unsigned VReg1 = MRI->createVirtualRegister(TRC);
+ Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
.addImm(NumLPads & 0xFFFF)
.add(predOps(ARMCC::AL));
@@ -8974,7 +9699,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
- unsigned VReg1 = MRI->createVirtualRegister(TRC);
+ Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
.addReg(VReg1, RegState::Define)
.addConstantPoolIndex(Idx)
@@ -8991,20 +9716,20 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addImm(ARMCC::HI)
.addReg(ARM::CPSR);
- unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ Register NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
.addReg(NewVReg1)
.addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
.add(predOps(ARMCC::AL))
.add(condCodeOp());
- unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ Register NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
- unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+ Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
.addReg(NewVReg3, RegState::Kill)
.addReg(NewVReg4)
@@ -9239,8 +9964,8 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
- unsigned dest = MI.getOperand(0).getReg();
- unsigned src = MI.getOperand(1).getReg();
+ Register dest = MI.getOperand(0).getReg();
+ Register src = MI.getOperand(1).getReg();
unsigned SizeVal = MI.getOperand(2).getImm();
unsigned Align = MI.getOperand(3).getImm();
DebugLoc dl = MI.getDebugLoc();
@@ -9291,9 +10016,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
unsigned srcIn = src;
unsigned destIn = dest;
for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
- unsigned srcOut = MRI.createVirtualRegister(TRC);
- unsigned destOut = MRI.createVirtualRegister(TRC);
- unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+ Register srcOut = MRI.createVirtualRegister(TRC);
+ Register destOut = MRI.createVirtualRegister(TRC);
+ Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
IsThumb1, IsThumb2);
emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
@@ -9306,9 +10031,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// [scratch, srcOut] = LDRB_POST(srcIn, 1)
// [destOut] = STRB_POST(scratch, destIn, 1)
for (unsigned i = 0; i < BytesLeft; i++) {
- unsigned srcOut = MRI.createVirtualRegister(TRC);
- unsigned destOut = MRI.createVirtualRegister(TRC);
- unsigned scratch = MRI.createVirtualRegister(TRC);
+ Register srcOut = MRI.createVirtualRegister(TRC);
+ Register destOut = MRI.createVirtualRegister(TRC);
+ Register scratch = MRI.createVirtualRegister(TRC);
emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
IsThumb1, IsThumb2);
emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
@@ -9351,7 +10076,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// Load an immediate to varEnd.
- unsigned varEnd = MRI.createVirtualRegister(TRC);
+ Register varEnd = MRI.createVirtualRegister(TRC);
if (Subtarget->useMovt()) {
unsigned Vtmp = varEnd;
if ((LoopSize & 0xFFFF0000) != 0)
@@ -9401,12 +10126,12 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// destPhi = PHI(destLoop, dst)
MachineBasicBlock *entryBB = BB;
BB = loopMBB;
- unsigned varLoop = MRI.createVirtualRegister(TRC);
- unsigned varPhi = MRI.createVirtualRegister(TRC);
- unsigned srcLoop = MRI.createVirtualRegister(TRC);
- unsigned srcPhi = MRI.createVirtualRegister(TRC);
- unsigned destLoop = MRI.createVirtualRegister(TRC);
- unsigned destPhi = MRI.createVirtualRegister(TRC);
+ Register varLoop = MRI.createVirtualRegister(TRC);
+ Register varPhi = MRI.createVirtualRegister(TRC);
+ Register srcLoop = MRI.createVirtualRegister(TRC);
+ Register srcPhi = MRI.createVirtualRegister(TRC);
+ Register destLoop = MRI.createVirtualRegister(TRC);
+ Register destPhi = MRI.createVirtualRegister(TRC);
BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
.addReg(varLoop).addMBB(loopMBB)
@@ -9420,7 +10145,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
// [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
- unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+ Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
IsThumb1, IsThumb2);
emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
@@ -9461,9 +10186,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
unsigned srcIn = srcLoop;
unsigned destIn = destLoop;
for (unsigned i = 0; i < BytesLeft; i++) {
- unsigned srcOut = MRI.createVirtualRegister(TRC);
- unsigned destOut = MRI.createVirtualRegister(TRC);
- unsigned scratch = MRI.createVirtualRegister(TRC);
+ Register srcOut = MRI.createVirtualRegister(TRC);
+ Register destOut = MRI.createVirtualRegister(TRC);
+ Register scratch = MRI.createVirtualRegister(TRC);
emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
IsThumb1, IsThumb2);
emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
@@ -9523,7 +10248,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
break;
case CodeModel::Large: {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+ Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
.addExternalSymbol("__chkstk");
@@ -9771,8 +10496,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// equality.
bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
- unsigned LHS1 = MI.getOperand(1).getReg();
- unsigned LHS2 = MI.getOperand(2).getReg();
+ Register LHS1 = MI.getOperand(1).getReg();
+ Register LHS2 = MI.getOperand(2).getReg();
if (RHSisZero) {
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(LHS1)
@@ -9782,8 +10507,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(LHS2).addImm(0)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
} else {
- unsigned RHS1 = MI.getOperand(3).getReg();
- unsigned RHS2 = MI.getOperand(4).getReg();
+ Register RHS1 = MI.getOperand(3).getReg();
+ Register RHS2 = MI.getOperand(4).getReg();
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(LHS1)
.addReg(RHS1)
@@ -9844,15 +10569,15 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
Fn->insert(BBI, RSBBB);
Fn->insert(BBI, SinkBB);
- unsigned int ABSSrcReg = MI.getOperand(1).getReg();
- unsigned int ABSDstReg = MI.getOperand(0).getReg();
+ Register ABSSrcReg = MI.getOperand(1).getReg();
+ Register ABSDstReg = MI.getOperand(0).getReg();
bool ABSSrcKIll = MI.getOperand(1).isKill();
bool isThumb2 = Subtarget->isThumb2();
MachineRegisterInfo &MRI = Fn->getRegInfo();
// In Thumb mode S must not be specified if source register is the SP or
// PC and if destination register is the SP, so restrict register class
- unsigned NewRsbDstReg =
- MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
+ Register NewRsbDstReg = MRI.createVirtualRegister(
+ isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
// Transfer the remainder of BB and its successor edges to sinkMBB.
SinkBB->splice(SinkBB->begin(), BB,
@@ -9931,7 +10656,7 @@ static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
// The MEMCPY both defines and kills the scratch registers.
for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
- unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
+ Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
: &ARM::GPRRegClass);
MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
}
@@ -10369,10 +11094,7 @@ static SDValue findMUL_LOHI(SDValue V) {
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
- if (Subtarget->isThumb()) {
- if (!Subtarget->hasDSP())
- return SDValue();
- } else if (!Subtarget->hasV5TEOps())
+ if (!Subtarget->hasBaseDSP())
return SDValue();
// SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
@@ -11253,7 +11975,7 @@ static SDValue PerformANDCombine(SDNode *N,
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
if (SplatBitSize <= 64) {
EVT VbicVT;
- SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
+ SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VbicVT, VT.is128BitVector(),
OtherModImm);
@@ -11469,6 +12191,77 @@ static SDValue PerformORCombineToBFI(SDNode *N,
return SDValue();
}
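+// Returns true if CC has a direct MVE vector-compare encoding; the unsigned
+// conditions (HS/HI) are only accepted for integer compares.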
+static bool isValidMVECond(unsigned CC, bool IsFloat) {
+ switch (CC) {
+ case ARMCC::EQ:
+ case ARMCC::NE:
+ case ARMCC::LE:
+ case ARMCC::GT:
+ case ARMCC::GE:
+ case ARMCC::LT:
+ return true;
+ case ARMCC::HS:
+ case ARMCC::HI:
+ return !IsFloat;
+ default:
+ return false;
+  }
+}
+
+static SDValue PerformORCombine_i1(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
+ // together with predicates
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ ARMCC::CondCodes CondCode0 = ARMCC::AL;
+ ARMCC::CondCodes CondCode1 = ARMCC::AL;
+ if (N0->getOpcode() == ARMISD::VCMP)
+ CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
+ ->getZExtValue();
+ else if (N0->getOpcode() == ARMISD::VCMPZ)
+ CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
+ ->getZExtValue();
+ if (N1->getOpcode() == ARMISD::VCMP)
+ CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
+ ->getZExtValue();
+ else if (N1->getOpcode() == ARMISD::VCMPZ)
+ CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
+ ->getZExtValue();
+
+ if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
+ return SDValue();
+
+ unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
+ unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);
+
+ if (!isValidMVECond(Opposite0,
+ N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
+ !isValidMVECond(Opposite1,
+ N1->getOperand(0)->getValueType(0).isFloatingPoint()))
+ return SDValue();
+
+ SmallVector<SDValue, 4> Ops0;
+ Ops0.push_back(N0->getOperand(0));
+ if (N0->getOpcode() == ARMISD::VCMP)
+ Ops0.push_back(N0->getOperand(1));
+ Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
+ SmallVector<SDValue, 4> Ops1;
+ Ops1.push_back(N1->getOperand(0));
+ if (N1->getOpcode() == ARMISD::VCMP)
+ Ops1.push_back(N1->getOperand(1));
+ Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));
+
+ SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
+ SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
+ SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
+ return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
+ DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
+}
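
The combine above is an application of De Morgan's law: an OR of two predicate compares is rewritten as the complement of an AND of the inverted compares, since AND-ed predicates chain more cheaply in MVE. A minimal standalone sketch of the identity on lane masks (illustrative values, not LLVM code):

  #include <cassert>
  #include <cstdint>

  int main() {
    // Hypothetical v4i1 lane masks produced by two vector compares.
    const uint8_t CmpA = 0b0110;
    const uint8_t CmpB = 0b1010;
    const uint8_t AllOnes = 0b1111;

    // Inverting a compare's condition code inverts its lane mask.
    const uint8_t InvA = ~CmpA & AllOnes;
    const uint8_t InvB = ~CmpB & AllOnes;

    // or(A, B) == xor(and(~A, ~B), all-ones), the form the combine emits.
    assert((CmpA | CmpB) == ((InvA & InvB) ^ AllOnes));
    return 0;
  }
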
+
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
@@ -11489,7 +12282,7 @@ static SDValue PerformORCombine(SDNode *N,
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
if (SplatBitSize <= 64) {
EVT VorrVT;
- SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+ SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VorrVT, VT.is128BitVector(),
OtherModImm);
@@ -11553,6 +12346,10 @@ static SDValue PerformORCombine(SDNode *N,
}
}
+ if (Subtarget->hasMVEIntegerOps() &&
+ (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
+ return PerformORCombine_i1(N, DCI, Subtarget);
+
// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
// reasonable.
if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
@@ -11921,6 +12718,24 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return Vec;
}
+static SDValue
+PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ SDLoc dl(N);
+
+ // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
+ if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
+ // If the valuetypes are the same, we can remove the cast entirely.
+ if (Op->getOperand(0).getValueType() == VT)
+ return Op->getOperand(0);
+ return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl,
+ Op->getOperand(0).getValueType(), Op->getOperand(0));
+ }
+
+ return SDValue();
+}
+
/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
@@ -12332,7 +13147,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
// The canonical VMOV for a zero vector uses a 32-bit element size.
unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
unsigned EltBits;
- if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
+ if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
EltSize = 8;
EVT VT = N->getValueType(0);
if (EltSize > VT.getScalarSizeInBits())
@@ -12382,95 +13197,163 @@ static SDValue PerformLOADCombine(SDNode *N,
return SDValue();
}
-/// PerformSTORECombine - Target-specific dag combine xforms for
-/// ISD::STORE.
-static SDValue PerformSTORECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- StoreSDNode *St = cast<StoreSDNode>(N);
- if (St->isVolatile())
- return SDValue();
-
- // Optimize trunc store (of multiple scalars) to shuffle and store. First,
- // pack all of the elements in one place. Next, store to memory in fewer
- // chunks.
+// Optimize trunc store (of multiple scalars) to shuffle and store. First,
+// pack all of the elements in one place. Next, store to memory in fewer
+// chunks.
+static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
+ SelectionDAG &DAG) {
SDValue StVal = St->getValue();
EVT VT = StVal.getValueType();
- if (St->isTruncatingStore() && VT.isVector()) {
- SelectionDAG &DAG = DCI.DAG;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT StVT = St->getMemoryVT();
- unsigned NumElems = VT.getVectorNumElements();
- assert(StVT != VT && "Cannot truncate to the same type");
- unsigned FromEltSz = VT.getScalarSizeInBits();
- unsigned ToEltSz = StVT.getScalarSizeInBits();
+ if (!St->isTruncatingStore() || !VT.isVector())
+ return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT StVT = St->getMemoryVT();
+ unsigned NumElems = VT.getVectorNumElements();
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromEltSz = VT.getScalarSizeInBits();
+ unsigned ToEltSz = StVT.getScalarSizeInBits();
+
+ // From, To sizes and ElemCount must be pow of two
+ if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
+ return SDValue();
- // From, To sizes and ElemCount must be pow of two
- if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ if (0 != (NumElems * FromEltSz) % ToEltSz)
+ return SDValue();
- // We are going to use the original vector elt for storing.
- // Accumulated smaller vector elements must be a multiple of the store size.
- if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
+ unsigned SizeRatio = FromEltSz / ToEltSz;
+ assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
- unsigned SizeRatio = FromEltSz / ToEltSz;
- assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+ // Create a type on which we perform the shuffle.
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+ NumElems * SizeRatio);
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
- // Create a type on which we perform the shuffle.
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
- NumElems*SizeRatio);
- assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDLoc DL(St);
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; ++i)
+ ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
+ : i * SizeRatio;
- SDLoc DL(St);
- SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
- SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; ++i)
- ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
- ? (i + 1) * SizeRatio - 1
- : i * SizeRatio;
-
- // Can't shuffle using an illegal type.
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
-
- SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
- DAG.getUNDEF(WideVec.getValueType()),
- ShuffleVec);
- // At this point all of the data is stored at the bottom of the
- // register. We now need to save it to mem.
-
- // Find the largest store unit
- MVT StoreType = MVT::i8;
- for (MVT Tp : MVT::integer_valuetypes()) {
- if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
- StoreType = Tp;
- }
- // Didn't find a legal store type.
- if (!TLI.isTypeLegal(StoreType))
- return SDValue();
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
- // Bitcast the original vector into a vector of store-size units
- EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
- StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
- assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
- SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
- SmallVector<SDValue, 8> Chains;
- SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
- TLI.getPointerTy(DAG.getDataLayout()));
- SDValue BasePtr = St->getBasePtr();
+ SDValue Shuff = DAG.getVectorShuffle(
+ WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
+ // At this point all of the data is stored at the bottom of the
+ // register. We now need to save it to mem.
- // Perform one or more big stores into memory.
- unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
- for (unsigned I = 0; I < E; I++) {
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- StoreType, ShuffWide,
- DAG.getIntPtrConstant(I, DL));
- SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
- BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
- Increment);
- Chains.push_back(Ch);
- }
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ // Find the largest store unit
+ MVT StoreType = MVT::i8;
+ for (MVT Tp : MVT::integer_valuetypes()) {
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+ StoreType = Tp;
}
+ // Didn't find a legal store type.
+ if (!TLI.isTypeLegal(StoreType))
+ return SDValue();
+
+ // Bitcast the original vector into a vector of store-size units
+ EVT StoreVecVT =
+ EVT::getVectorVT(*DAG.getContext(), StoreType,
+ VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
+ assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue BasePtr = St->getBasePtr();
+
+ // Perform one or more big stores into memory.
+ unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
+ for (unsigned I = 0; I < E; I++) {
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
+ ShuffWide, DAG.getIntPtrConstant(I, DL));
+ SDValue Ch =
+ DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ BasePtr =
+ DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
+ Chains.push_back(Ch);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+}
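
As a concrete illustration of the shuffle mask built above: for a v4i32 value truncated and stored as v4i8, SizeRatio is 4 and the mask selects sub-element i*4 of the bitcast vector on little-endian (or (i+1)*4-1 on big-endian), which is exactly the byte that survives the truncation. A standalone sketch, assuming a little-endian host; names and values are illustrative, not LLVM code:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    uint32_t Wide[4] = {0x11AA, 0x22BB, 0x33CC, 0x44DD};
    uint8_t Narrow[16];
    std::memcpy(Narrow, Wide, sizeof(Wide)); // models the ISD::BITCAST step

    const unsigned SizeRatio = sizeof(uint32_t) / sizeof(uint8_t); // 4
    const bool BigEndian = false; // assumption: little-endian host
    for (unsigned i = 0; i < 4; ++i) {
      unsigned Idx = BigEndian ? (i + 1) * SizeRatio - 1 : i * SizeRatio;
      // The selected sub-element is the truncated value of lane i.
      assert(Narrow[Idx] == static_cast<uint8_t>(Wide[i]));
    }
    return 0;
  }
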
+
+// Try taking a single vector store from a truncate (which would otherwise turn
+// into an expensive buildvector) and splitting it into a series of narrowing
+// stores.
+static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
+ SelectionDAG &DAG) {
+ if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
+ return SDValue();
+ SDValue Trunc = St->getValue();
+ if (Trunc->getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+ EVT FromVT = Trunc->getOperand(0).getValueType();
+ EVT ToVT = Trunc.getValueType();
+ if (!ToVT.isVector())
+ return SDValue();
+ assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
+ EVT ToEltVT = ToVT.getVectorElementType();
+ EVT FromEltVT = FromVT.getVectorElementType();
+
+ unsigned NumElements = 0;
+ if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
+ NumElements = 4;
+ if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
+ NumElements = 8;
+ if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
+ FromVT.getVectorNumElements() % NumElements != 0)
+ return SDValue();
+
+ SDLoc DL(St);
+ // Details about the old store
+ SDValue Ch = St->getChain();
+ SDValue BasePtr = St->getBasePtr();
+ unsigned Alignment = St->getOriginalAlignment();
+ MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = St->getAAInfo();
+
+ EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
+ EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
+
+ SmallVector<SDValue, 4> Stores;
+ for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+ unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
+ SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
+ DAG.getConstant(i * NumElements, DL, MVT::i32));
+ SDValue Store = DAG.getTruncStore(
+ Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
+ NewToVT, Alignment, MMOFlags, AAInfo);
+ Stores.push_back(Store);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
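
For example, with the NumElements choices above, a truncating store of v8i32 to v8i16 is split into two v4i32 -> v4i16 narrowing stores placed 8 bytes apart. A small sketch of the offset arithmetic (illustrative values, not LLVM code):

  #include <cassert>

  int main() {
    const unsigned FromElts = 8;    // v8i32 source
    const unsigned ToEltBits = 16;  // stored as i16
    const unsigned NumElements = 4; // one MVE-sized chunk per store
    unsigned Offsets[FromElts / NumElements];
    for (unsigned i = 0; i < FromElts / NumElements; ++i)
      Offsets[i] = i * NumElements * ToEltBits / 8;
    assert(Offsets[0] == 0 && Offsets[1] == 8); // two stores, 8 bytes apart
    return 0;
  }
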
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+static SDValue PerformSTORECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ if (St->isVolatile())
+ return SDValue();
+ SDValue StVal = St->getValue();
+ EVT VT = StVal.getValueType();
+
+ if (Subtarget->hasNEON())
+ if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
+ return Store;
+
+ if (Subtarget->hasMVEIntegerOps())
+ if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
+ return NewToken;
if (!ISD::isNormalStore(St))
return SDValue();
@@ -12522,7 +13405,7 @@ static SDValue PerformSTORECombine(SDNode *N,
}
// If this is a legal vector store, try to combine it into a VST1_UPD.
- if (ISD::isNormalStore(N) && VT.isVector() &&
+ if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
return CombineBaseUpdate(N, DCI);
@@ -12890,6 +13773,71 @@ static SDValue PerformShiftCombine(SDNode *N,
return SDValue();
}
+// Look for a sign/zero extend of a larger-than-legal load. This can be split
+// into two extending loads, which are simpler to deal with than an arbitrary
+// sign extend.
+static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() != ISD::LOAD)
+ return SDValue();
+ LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
+ if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
+ LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+ EVT FromVT = LD->getValueType(0);
+ EVT ToVT = N->getValueType(0);
+ if (!ToVT.isVector())
+ return SDValue();
+ assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
+ EVT ToEltVT = ToVT.getVectorElementType();
+ EVT FromEltVT = FromVT.getVectorElementType();
+
+ unsigned NumElements = 0;
+ if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+ NumElements = 4;
+ if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
+ NumElements = 8;
+ if (NumElements == 0 ||
+ FromVT.getVectorNumElements() == NumElements ||
+ FromVT.getVectorNumElements() % NumElements != 0 ||
+ !isPowerOf2_32(NumElements))
+ return SDValue();
+
+ SDLoc DL(LD);
+ // Details about the old load
+ SDValue Ch = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ unsigned Alignment = LD->getOriginalAlignment();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ ISD::LoadExtType NewExtType =
+ N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
+ EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
+ SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+ // Split the load in half, each side of which is extended separately. This
+ // is good enough, as legalisation will take it from there. They are either
+ // already legal or they will be split further into something that is
+ // legal.
+ SDValue NewLoad1 =
+ DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
+ LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo);
+ SDValue NewLoad2 =
+ DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+ LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+ Alignment, MMOFlags, AAInfo);
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(NewLoad1.getNode(), 1),
+ SDValue(NewLoad2.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
+}
+
/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
@@ -12927,6 +13875,10 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ if (ST->hasMVEIntegerOps())
+ if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
+ return NewLoad;
+
return SDValue();
}
@@ -13028,43 +13980,169 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
return V;
}
+// Given N, the value controlling the conditional branch, search for the loop
+// intrinsic, returning it, along with how the value is used. We need to handle
+// patterns such as the following:
+// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
+// (brcond (setcc (loop.decrement), 0, eq), exit)
+// (brcond (setcc (loop.decrement), 0, ne), header)
+static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
+ bool &Negate) {
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::XOR: {
+ if (!isa<ConstantSDNode>(N.getOperand(1)))
+ return SDValue();
+ if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
+ return SDValue();
+ Negate = !Negate;
+ return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
+ }
+ case ISD::SETCC: {
+ auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!Const)
+ return SDValue();
+ if (Const->isNullValue())
+ Imm = 0;
+ else if (Const->isOne())
+ Imm = 1;
+ else
+ return SDValue();
+ CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
+ return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
+ if (IntOp != Intrinsic::test_set_loop_iterations &&
+ IntOp != Intrinsic::loop_decrement_reg)
+ return SDValue();
+ return N;
+ }
+ }
+ return SDValue();
+}
+
static SDValue PerformHWLoopCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
- // Look for (brcond (xor test.set.loop.iterations, -1)
- SDValue CC = N->getOperand(1);
- unsigned Opc = CC->getOpcode();
- SDValue Int;
- if ((Opc == ISD::XOR || Opc == ISD::SETCC) &&
- (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) {
+ // The hwloop intrinsics that we're interested in are used for control flow,
+ // either for entering or exiting the loop:
+ // - test.set.loop.iterations will test whether its operand is zero. If it
+ // is zero, the following branch should not enter the loop.
+ // - loop.decrement.reg also tests whether its operand is zero. If it is
+ // zero, the following branch should not branch back to the beginning of
+ // the loop.
+ // So here, we need to check how the brcond uses the result of each of the
+ // intrinsics to ensure that we're branching to the right place at the
+ // right time.
+
+ ISD::CondCode CC;
+ SDValue Cond;
+ int Imm = 1;
+ bool Negate = false;
+ SDValue Chain = N->getOperand(0);
+ SDValue Dest;
- assert((isa<ConstantSDNode>(CC->getOperand(1)) &&
- cast<ConstantSDNode>(CC->getOperand(1))->isOne()) &&
- "Expected to compare against 1");
+ if (N->getOpcode() == ISD::BRCOND) {
+ CC = ISD::SETEQ;
+ Cond = N->getOperand(1);
+ Dest = N->getOperand(2);
+ } else {
+ assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
+ CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ Cond = N->getOperand(2);
+ Dest = N->getOperand(4);
+ if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
+ if (!Const->isOne() && !Const->isNullValue())
+ return SDValue();
+ Imm = Const->getZExtValue();
+ } else
+ return SDValue();
+ }
- Int = CC->getOperand(0);
- } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN)
- Int = CC;
- else
+ SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
+ if (!Int)
return SDValue();
- unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue();
- if (IntOp != Intrinsic::test_set_loop_iterations)
- return SDValue();
+ if (Negate)
+ CC = ISD::getSetCCInverse(CC, true);
+
+ auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
+ return (CC == ISD::SETEQ && Imm == 0) ||
+ (CC == ISD::SETNE && Imm == 1) ||
+ (CC == ISD::SETLT && Imm == 1) ||
+ (CC == ISD::SETULT && Imm == 1);
+ };
+
+ auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
+ return (CC == ISD::SETEQ && Imm == 1) ||
+ (CC == ISD::SETNE && Imm == 0) ||
+ (CC == ISD::SETGT && Imm == 0) ||
+ (CC == ISD::SETUGT && Imm == 0) ||
+ (CC == ISD::SETGE && Imm == 1) ||
+ (CC == ISD::SETUGE && Imm == 1);
+ };
+
+ assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
+ "unsupported condition");
SDLoc dl(Int);
- SDValue Chain = N->getOperand(0);
+ SelectionDAG &DAG = DCI.DAG;
SDValue Elements = Int.getOperand(2);
- SDValue ExitBlock = N->getOperand(2);
+ unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
+ assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
+ && "expected single br user");
+ SDNode *Br = *N->use_begin();
+ SDValue OtherTarget = Br->getOperand(1);
+
+ // Update the unconditional branch to branch to the given Dest.
+ auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
+ SDValue NewBrOps[] = { Br->getOperand(0), Dest };
+ SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
+ };
- // TODO: Once we start supporting tail predication, we can add another
- // operand to WLS for the number of elements processed in a vector loop.
+ if (IntOp == Intrinsic::test_set_loop_iterations) {
+ SDValue Res;
+ // We expect this 'instruction' to branch when the counter is zero.
+ if (IsTrueIfZero(CC, Imm)) {
+ SDValue Ops[] = { Chain, Elements, Dest };
+ Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+ } else {
+ // The logic is the reverse of what we need for WLS, so find the other
+ // basic block target: the target of the following br.
+ UpdateUncondBr(Br, Dest, DAG);
- SDValue Ops[] = { Chain, Elements, ExitBlock };
- SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
- DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
- return Res;
+ SDValue Ops[] = { Chain, Elements, OtherTarget };
+ Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+ }
+ DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
+ return Res;
+ } else {
+ SDValue Size = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
+ SDValue Args[] = { Int.getOperand(0), Elements, Size, };
+ SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
+ DAG.getVTList(MVT::i32, MVT::Other), Args);
+ DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
+
+ // We expect this instruction to branch when the count is not zero.
+ SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
+
+ // Update the unconditional branch to target the loop preheader if we've
+ // found the condition has been reversed.
+ if (Target == OtherTarget)
+ UpdateUncondBr(Br, Dest, DAG);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ SDValue(LoopDec.getNode(), 1), Chain);
+
+ SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
+ return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
+ }
+ return SDValue();
}
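
The IsTrueIfZero / IsFalseIfZero predicates above only need to be correct for the value actually compared, which for the Imm == 1 forms is the boolean (0 or 1) result of the loop-test intrinsic. A standalone sketch checking that, under that assumption, the accepted (condition, immediate) forms collapse to X == 0 and X != 0 respectively (not LLVM code; unsigned variants behave identically for 0 and 1):

  #include <cassert>

  int main() {
    for (int X = 0; X <= 1; ++X) {
      // IsTrueIfZero forms: X == 0, X != 1, X < 1 (and X u< 1).
      assert((X != 1) == (X == 0));
      assert((X < 1) == (X == 0));
      // IsFalseIfZero forms: X == 1, X != 0, X > 0, X >= 1 (and unsigned).
      assert((X == 1) == (X != 0));
      assert((X > 0) == (X != 0));
      assert((X >= 1) == (X != 0));
    }
    return 0;
  }
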
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
@@ -13298,14 +14376,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
- case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget);
+ case ISD::BRCOND:
+ case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
- case ISD::STORE: return PerformSTORECombine(N, DCI);
+ case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
@@ -13334,6 +14413,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformVLDCombine(N, DCI);
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
+ case ARMISD::PREDICATE_CAST:
+ return PerformPREDICATE_CASTCombine(N, DCI);
case ARMISD::SMULWB: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
@@ -13348,7 +14429,9 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
break;
}
- case ARMISD::SMLALBB: {
+ case ARMISD::SMLALBB:
+ case ARMISD::QADD16b:
+ case ARMISD::QSUB16b: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
@@ -13384,6 +14467,15 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
break;
}
+ case ARMISD::QADD8b:
+ case ARMISD::QSUB8b: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
+ if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
+ return SDValue();
+ break;
+ }
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -13457,47 +14549,38 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
if (!Subtarget->hasMVEIntegerOps())
return false;
- if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
- Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
- Ty != MVT::v2f64 &&
- // These are for truncated stores
- Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16)
- return false;
- if (Subtarget->isLittle()) {
- // In little-endian MVE, the store instructions VSTRB.U8,
- // VSTRH.U16 and VSTRW.U32 all store the vector register in
- // exactly the same format, and differ only in the range of
- // their immediate offset field and the required alignment.
- //
- // In particular, VSTRB.U8 can store a vector at byte alignment.
- // So at this stage we can simply say that loads/stores of all
- // 128-bit wide vector types are permitted at any alignment,
- // because we know at least _one_ instruction can manage that.
- //
- // Later on we might find that some of those loads are better
- // generated as VLDRW.U32 if alignment permits, to take
- // advantage of the larger immediate range. But for the moment,
- // all that matters is that if we don't lower the load then
- // _some_ instruction can handle it.
+ // These are for predicates
+ if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
+
+ // These are for truncated stores/narrowing loads. They are fine so long as
+ // the alignment is at least the size of the item being loaded
+ if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
+ Alignment >= VT.getScalarSizeInBits() / 8) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
+
+ // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
+ // VSTRW.U32 all store the vector register in exactly the same format, and
+ // differ only in the range of their immediate offset field and the required
+ // alignment. So there is always a store that can be used, regardless of
+ // actual type.
+ //
+ // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
+ // VREV64.8) pair and get the same effect. This will likely be better than
+ // aligning the vector through the stack.
+ if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
+ Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
+ Ty == MVT::v2f64) {
if (Fast)
*Fast = true;
return true;
- } else {
- // In big-endian MVE, those instructions aren't so similar
- // after all, because they reorder the bytes of the vector
- // differently. So this time we can only store a particular
- // kind of vector if its alignment is at least the element
- // type. And we can't store vectors of i64 or f64 at all
- // without having to do some postprocessing, because there's
- // no VSTRD.U64.
- if (Ty == MVT::v16i8 ||
- ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
- ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
- if (Fast)
- *Fast = true;
- return true;
- }
}
return false;
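
For the narrow truncating-store/narrowing-load types handled above, the only requirement is element alignment. A hedged sketch of that check, with a made-up helper name (not LLVM code):

  #include <cassert>

  static bool narrowTypeAllowed(unsigned ScalarBits, unsigned AlignBytes) {
    // Fine so long as the access is at least element-aligned.
    return AlignBytes >= ScalarBits / 8;
  }

  int main() {
    assert(narrowTypeAllowed(16, 2));  // v4i16 at 2-byte alignment
    assert(!narrowTypeAllowed(16, 1)); // v4i16 at byte alignment is rejected
    assert(narrowTypeAllowed(8, 1));   // v8i8 needs only byte alignment
    return 0;
  }
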
@@ -13617,22 +14700,60 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) {
/// sext/zext can be folded into vsubl.
bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const {
- if (!Subtarget->hasNEON() || !I->getType()->isVectorTy())
+ if (!I->getType()->isVectorTy())
return false;
- switch (I->getOpcode()) {
- case Instruction::Sub:
- case Instruction::Add: {
- if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ if (Subtarget->hasNEON()) {
+ switch (I->getOpcode()) {
+ case Instruction::Sub:
+ case Instruction::Add: {
+ if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ return false;
+ Ops.push_back(&I->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(1));
+ return true;
+ }
+ default:
return false;
- Ops.push_back(&I->getOperandUse(0));
- Ops.push_back(&I->getOperandUse(1));
- return true;
+ }
}
- default:
+
+ if (!Subtarget->hasMVEIntegerOps())
+ return false;
+
+ auto IsSinker = [](Instruction *I, int Operand) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ return true;
+ case Instruction::Sub:
+ return Operand == 1;
+ default:
+ return false;
+ }
+ };
+
+ int Op = 0;
+ if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
+ Op = 1;
+ if (!IsSinker(I, Op))
+ return false;
+ if (!match(I->getOperand(Op),
+ m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_Zero()))) {
return false;
}
- return false;
+ Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
+ // All uses of the shuffle should be sunk to avoid duplicating it across gpr
+ // and vector registers
+ for (Use &U : Shuffle->uses()) {
+ Instruction *Insn = cast<Instruction>(U.getUser());
+ if (!IsSinker(Insn, U.getOperandNo()))
+ return false;
+ }
+ Ops.push_back(&Shuffle->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(Op));
+ return true;
}
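
The IsSinker lambda above decides whether a splat may be duplicated into a user block: Add and Mul accept the splatted value on either side, Sub only as the right-hand operand. A toy model of that rule (the enum and Use struct are made up, not LLVM code):

  #include <cassert>

  enum Opcode { Add, Sub, Mul };
  struct Use { Opcode Op; int OperandNo; };

  static bool isSinker(Opcode Op, int OperandNo) {
    switch (Op) {
    case Add:
    case Mul:
      return true;
    case Sub:
      return OperandNo == 1; // x - splat is fine, splat - x is not
    }
    return false;
  }

  int main() {
    // All uses sinkable: the splat can be duplicated into each user block.
    Use Good[] = {{Mul, 0}, {Add, 1}, {Sub, 1}};
    for (const Use &U : Good)
      assert(isSinker(U.Op, U.OperandNo));
    // One non-sinkable use (splat as LHS of Sub) blocks the whole transform.
    assert(!isSinker(Sub, 0));
    return 0;
  }
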
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
@@ -13641,6 +14762,11 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
if (!isTypeLegal(VT))
return false;
+ if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
+ if (Ld->isExpandingLoad())
+ return false;
+ }
+
// Don't create a loadext if we can fold the extension into a wide/long
// instruction.
// If there's more than one user instruction, the loadext is desirable no
@@ -14028,6 +15154,52 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
return false;
}
+static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
+ bool isSEXTLoad, bool isLE, SDValue &Base,
+ SDValue &Offset, bool &isInc,
+ SelectionDAG &DAG) {
+ if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+ return false;
+ if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
+ return false;
+
+ ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
+ int RHSC = (int)RHS->getZExtValue();
+
+ auto IsInRange = [&](int RHSC, int Limit, int Scale) {
+ if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
+ assert(Ptr->getOpcode() == ISD::ADD);
+ isInc = false;
+ Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
+ return true;
+ } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
+ isInc = Ptr->getOpcode() == ISD::ADD;
+ Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
+ return true;
+ }
+ return false;
+ };
+
+ // Try to find a matching instruction based on s/zext, Alignment, Offset and
+ // (in BE) type.
+ Base = Ptr->getOperand(0);
+ if (VT == MVT::v4i16) {
+ if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
+ return true;
+ } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
+ if (IsInRange(RHSC, 0x80, 1))
+ return true;
+ } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) &&
+ IsInRange(RHSC, 0x80, 4))
+ return true;
+ else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) &&
+ IsInRange(RHSC, 0x80, 2))
+ return true;
+ else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
+ return true;
+ return false;
+}
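
The IsInRange lambda above encodes the MVE pre/post-indexed immediate rule: the offset must be non-zero, a multiple of the element size, and strictly within +/-(0x80 * Scale), i.e. a scaled 7-bit offset. A standalone sketch of that check with an illustrative helper name (not LLVM code):

  #include <cassert>
  #include <cstdlib>

  static bool isInRange(int RHSC, int Limit, int Scale) {
    // Non-zero, a multiple of Scale, strictly within +/- Limit * Scale.
    return RHSC != 0 && RHSC % Scale == 0 && std::abs(RHSC) < Limit * Scale;
  }

  int main() {
    // 32-bit elements (Scale = 4, Limit = 0x80): multiples of 4 in (-512, 512).
    assert(isInRange(508, 0x80, 4));
    assert(!isInRange(512, 0x80, 4)); // equals Limit * Scale, out of range
    assert(!isInRange(6, 0x80, 4));   // not a multiple of the scale
    assert(isInRange(-4, 0x80, 4));
    assert(!isInRange(0, 0x80, 4));   // zero never selects a pre/post form
    return 0;
  }
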
+
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
@@ -14041,25 +15213,35 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
EVT VT;
SDValue Ptr;
+ unsigned Align;
bool isSEXTLoad = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
- VT = LD->getMemoryVT();
+ VT = LD->getMemoryVT();
+ Align = LD->getAlignment();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
Ptr = ST->getBasePtr();
- VT = ST->getMemoryVT();
+ VT = ST->getMemoryVT();
+ Align = ST->getAlignment();
} else
return false;
bool isInc;
bool isLegal = false;
- if (Subtarget->isThumb2())
- isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
- Offset, isInc, DAG);
- else
- isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
- Offset, isInc, DAG);
+ if (VT.isVector())
+ isLegal = Subtarget->hasMVEIntegerOps() &&
+ getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
+ Subtarget->isLittle(), Base, Offset,
+ isInc, DAG);
+ else {
+ if (Subtarget->isThumb2())
+ isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+ Offset, isInc, DAG);
+ else
+ isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+ Offset, isInc, DAG);
+ }
if (!isLegal)
return false;
@@ -14077,15 +15259,18 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
+ unsigned Align;
bool isSEXTLoad = false, isNonExt;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- VT = LD->getMemoryVT();
+ VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
+ Align = LD->getAlignment();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
- VT = ST->getMemoryVT();
+ VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
+ Align = ST->getAlignment();
isNonExt = !ST->isTruncatingStore();
} else
return false;
@@ -14108,12 +15293,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
bool isInc;
bool isLegal = false;
- if (Subtarget->isThumb2())
- isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
- isInc, DAG);
- else
- isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+ if (VT.isVector())
+ isLegal = Subtarget->hasMVEIntegerOps() &&
+ getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad,
+ Subtarget->isLittle(), Base, Offset,
isInc, DAG);
+ else {
+ if (Subtarget->isThumb2())
+ isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+ isInc, DAG);
+ else
+ isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+ isInc, DAG);
+ }
if (!isLegal)
return false;
@@ -14369,7 +15561,8 @@ const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(StringRef Constraint) const {
- if (Constraint.size() == 1) {
+ unsigned S = Constraint.size();
+ if (S == 1) {
switch (Constraint[0]) {
default: break;
case 'l': return C_RegisterClass;
@@ -14377,12 +15570,12 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const {
case 'h': return C_RegisterClass;
case 'x': return C_RegisterClass;
case 't': return C_RegisterClass;
- case 'j': return C_Other; // Constant for movw.
- // An address with a single base register. Due to the way we
- // currently handle addresses it is the same as an 'r' memory constraint.
+ case 'j': return C_Immediate; // Constant for movw.
+ // An address with a single base register. Due to the way we
+ // currently handle addresses it is the same as an 'r' memory constraint.
case 'Q': return C_Memory;
}
- } else if (Constraint.size() == 2) {
+ } else if (S == 2) {
switch (Constraint[0]) {
default: break;
case 'T': return C_RegisterClass;
@@ -14535,7 +15728,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'j':
// Constant suitable for movw, must be between 0 and
// 65535.
- if (Subtarget->hasV6T2Ops())
+ if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
if (CVal >= 0 && CVal <= 65535)
break;
return;
@@ -14643,7 +15836,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'N':
- if (Subtarget->isThumb()) { // FIXME thumb2
+ if (Subtarget->isThumb1Only()) {
// This must be a constant between 0 and 31, for shift amounts.
if (CVal >= 0 && CVal <= 31)
break;
@@ -14651,7 +15844,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'O':
- if (Subtarget->isThumb()) { // FIXME thumb2
+ if (Subtarget->isThumb1Only()) {
// This must be a multiple of 4 between -508 and 508, for
// ADD/SUB sp = sp + immediate.
if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
@@ -14874,6 +16067,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
// without FP16. So we must do a function call.
SDLoc Loc(Op);
RTLIB::Libcall LC;
+ MakeLibCallOptions CallOptions;
if (SrcSz == 16) {
// Instruction from 16 -> 32
if (Subtarget->hasFP16())
@@ -14884,7 +16078,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_EXTEND");
SrcVal =
- makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first;
+ makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first;
}
}
@@ -14897,7 +16091,7 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_EXTEND");
- return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first;
+ return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first;
}
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
@@ -14923,7 +16117,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Unexpected type for custom-lowering FP_ROUND");
- return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first;
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first;
}
void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -15015,7 +16210,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+ Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
@@ -15030,7 +16225,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
- Info.align = 0;
+ Info.align.reset();
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
@@ -15056,7 +16251,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+ Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
@@ -15077,7 +16272,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 0;
+ Info.align.reset();
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
@@ -15090,7 +16285,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -15102,7 +16297,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -15112,7 +16307,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
- Info.align = 8;
+ Info.align = Align(8);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
@@ -15122,7 +16317,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 8;
+ Info.align = Align(8);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
@@ -15473,6 +16668,12 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
return VecSize == 64 || VecSize % 128 == 0;
}
+unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
+ if (Subtarget->hasNEON())
+ return 4;
+ return TargetLoweringBase::getMaxSupportedInterleaveFactor();
+}
+
/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -15792,15 +16993,15 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
}
/// Return the correct alignment for the current calling convention.
-unsigned
-ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
- DataLayout DL) const {
+Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const {
+ const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy));
if (!ArgTy->isVectorTy())
- return DL.getABITypeAlignment(ArgTy);
+ return ABITypeAlign;
// Avoid over-aligning vector parameters. It would require realigning the
// stack and waste space for no real benefit.
- return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
+ return std::min(ABITypeAlign, DL.getStackAlignment());
}
/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
@@ -15861,7 +17062,7 @@ void ARMTargetLowering::insertCopiesSplitCSR(
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 1675ec59a354..53813fad5afd 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -103,6 +103,7 @@ class VectorType;
ADDE, // Add using carry
SUBC, // Sub with carry
SUBE, // Sub using carry
+ LSLS, // Shift left producing carry
VMOVRRD, // double to two gprs.
VMOVDRR, // Two gprs to double.
@@ -126,17 +127,13 @@ class VectorType;
WIN__DBZCHK, // Windows' divide by zero check
WLS, // Low-overhead loops, While Loop Start
+ LOOP_DEC, // Really a part of LE, performs the sub
+ LE, // Low-overhead loops, Loop End
- VCEQ, // Vector compare equal.
- VCEQZ, // Vector compare equal to zero.
- VCGE, // Vector compare greater than or equal.
- VCGEZ, // Vector compare greater than or equal to zero.
- VCLEZ, // Vector compare less than or equal to zero.
- VCGEU, // Vector compare unsigned greater than or equal.
- VCGT, // Vector compare greater than.
- VCGTZ, // Vector compare greater than zero.
- VCLTZ, // Vector compare less than zero.
- VCGTU, // Vector compare unsigned greater than.
+ PREDICATE_CAST, // Predicate cast for MVE i1 types
+
+ VCMP, // Vector compare.
+ VCMPZ, // Vector compare to zero.
VTST, // Vector test bits.
// Vector shift by vector
@@ -200,6 +197,7 @@ class VectorType;
VTRN, // transpose
VTBL1, // 1-register shuffle with mask
VTBL2, // 2-register shuffle with mask
+ VMOVN, // MVE vmovn
// Vector multiply long:
VMULLs, // ...signed
@@ -221,6 +219,12 @@ class VectorType;
SMMLAR, // Signed multiply long, round and add
SMMLSR, // Signed multiply long, subtract and round
+ // Single-lane QADD8 and QADD16: only the bottom lane is used; that's what
+ // the 'b' stands for.
+ QADD8b,
+ QSUB8b,
+ QADD16b,
+ QSUB16b,
+
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
// operations, but for ARM some BUILD_VECTORs are legal as-is and their
@@ -243,6 +247,11 @@ class VectorType;
// instructions.
MEMCPY,
+ // V8.1-M Mainline condition select
+ CSINV, // Conditional select invert.
+ CSNEG, // Conditional select negate.
+ CSINC, // Conditional select increment.
+
// Vector load N-element structure to all lanes:
VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
VLD2DUP,
@@ -539,7 +548,7 @@ class VectorType;
Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
- unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+ unsigned getMaxSupportedInterleaveFactor() const override;
bool lowerInterleavedLoad(LoadInst *LI,
ArrayRef<ShuffleVectorInst *> Shuffles,
@@ -608,8 +617,8 @@ class VectorType;
void finalizeLowering(MachineFunction &MF) const override;
/// Return the correct alignment for the current calling convention.
- unsigned getABIAlignmentForCallingConv(Type *ArgTy,
- DataLayout DL) const override;
+ Align getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const override;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
@@ -670,6 +679,8 @@ class VectorType;
SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -721,8 +732,8 @@ class VectorType;
void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
@@ -814,7 +825,7 @@ class VectorType;
SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
- const SDLoc &dl, bool InvalidOnQNaN) const;
+ const SDLoc &dl) const;
SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
@@ -838,7 +849,7 @@ class VectorType;
void setAllExpand(MVT VT);
};
- enum NEONModImmType {
+ enum VMOVModImmType {
VMOVModImm,
VMVNModImm,
MVEVMVNModImm,
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index bc93a058720c..1da32ad2af6c 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -188,6 +188,13 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
let DecoderMethod = "DecodeCCOutOperand";
}
+// Transform to generate the inverse of a condition code during ISel
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+ ARMCC::CondCodes CC = static_cast<ARMCC::CondCodes>(N->getZExtValue());
+ return CurDAG->getTargetConstant(ARMCC::getOppositeCondition(CC), SDLoc(N),
+ MVT::i32);
+}]>;
+
// VPT predicate
def VPTPredNOperand : AsmOperandClass {
@@ -401,6 +408,8 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
// mnemonic (when not in an IT block) or preclude it (when in an IT block).
bit thumbArithFlagSetting = 0;
+ bit validForTailPredication = 0;
+
// If this is a pseudo instruction, mark it isCodeGenOnly.
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
@@ -412,6 +421,7 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
let TSFlags{14} = canXformTo16Bit;
let TSFlags{18-15} = D.Value;
let TSFlags{19} = thumbArithFlagSetting;
+ let TSFlags{20} = validForTailPredication;
let Constraints = cstr;
let Itinerary = itin;
@@ -455,6 +465,7 @@ class AsmPseudoInst<string asm, dag iops, dag oops = (outs)>
let isCodeGenOnly = 0; // So we get asm matcher for it.
let AsmString = asm;
let isPseudo = 1;
+ let hasNoSchedulingInfo = 1;
}
class ARMAsmPseudo<string asm, dag iops, dag oops = (outs)>
@@ -2282,7 +2293,7 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
let Inst{24} = SIMM{7};
let Inst{18-16} = SIMM{6-4};
let Inst{3-0} = SIMM{3-0};
- let DecoderMethod = "DecodeNEONModImmInstruction";
+ let DecoderMethod = "DecodeVMOVModImmInstruction";
}
// NEON 2 vector register format.
@@ -2724,6 +2735,16 @@ def complexrotateopodd : Operand<i32> {
let PrintMethod = "printComplexRotationOp<180, 90>";
}
+def MveSaturateOperand : AsmOperandClass {
+ let PredicateMethod = "isMveSaturateOp";
+ let DiagnosticString = "saturate operand must be 48 or 64";
+ let Name = "MveSaturate";
+}
+def saturateop : Operand<i32> {
+ let ParserMatchClass = MveSaturateOperand;
+ let PrintMethod = "printMveSaturateOp";
+}
+
// Data type suffix token aliases. Implements Table A7-3 in the ARM ARM.
def : TokenAlias<".s8", ".i8">;
def : TokenAlias<".u8", ".i8">;
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 388c889349b7..a802d5a06f07 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -117,7 +117,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- unsigned Reg = MI->getOperand(0).getReg();
+ Register Reg = MI->getOperand(0).getReg();
MachineInstrBuilder MIB;
MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index e35145463852..fe696222ec70 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -51,8 +51,6 @@ def SDT_ARMAnd : SDTypeProfile<1, 2,
SDTCisVT<2, i32>]>;
def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
-def SDT_ARMFCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>,
- SDTCisVT<2, i32>]>;
def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
@@ -108,14 +106,24 @@ def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>,
// TODO Add another operand for 'Size' so that we can re-use this node when we
// start supporting *TP versions.
-def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>,
- SDTCisVT<1, OtherVT>]>;
+def SDT_ARMLoLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>,
+ SDTCisVT<1, OtherVT>]>;
def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>;
def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>;
+def SDT_ARMCSel : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<3>,
+ SDTCisVT<3, i32>]>;
+
+def ARMcsinv : SDNode<"ARMISD::CSINV", SDT_ARMCSel, [SDNPOptInGlue]>;
+def ARMcsneg : SDNode<"ARMISD::CSNEG", SDT_ARMCSel, [SDNPOptInGlue]>;
+def ARMcsinc : SDNode<"ARMISD::CSINC", SDT_ARMCSel, [SDNPOptInGlue]>;
+
def SDT_MulHSR : SDTypeProfile<1, 3, [SDTCisVT<0,i32>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
@@ -194,6 +202,7 @@ def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>;
def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>;
+def ARMlsls : SDNode<"ARMISD::LSLS", SDTBinaryArithWithFlags>;
def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>;
def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>;
@@ -229,6 +238,11 @@ def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>;
def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;
+def ARMqadd8b : SDNode<"ARMISD::QADD8b", SDT_ARMAnd, []>;
+def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>;
+def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>;
+def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>;
+
// Vector operations shared between NEON and MVE
def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
@@ -265,8 +279,16 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>;
def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>;
-def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop,
- [SDNPHasChain]>;
+def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+ SDTCisInt<3>]>;
+def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>;
+
+def ARMvcmp : SDNode<"ARMISD::VCMP", SDTARMVCMP>;
+def ARMvcmpz : SDNode<"ARMISD::VCMPZ", SDTARMVCMPZ>;
+
+def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
+def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
+def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
@@ -1948,7 +1970,7 @@ multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
/// the function. The first operand is the ID# for this instruction, the second
/// is the index into the MachineConstantPool that this is, the third is the
/// size in bytes of this constant pool entry.
-let hasSideEffects = 0, isNotDuplicable = 1 in
+let hasSideEffects = 0, isNotDuplicable = 1, hasNoSchedulingInfo = 1 in
def CONSTPOOL_ENTRY :
PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
i32imm:$size), NoItinerary, []>;
@@ -2361,6 +2383,12 @@ let isCall = 1,
def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func),
8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBr]>;
+
+ // push lr before the call
+ def BL_PUSHLR : ARMPseudoInst<(outs), (ins GPRlr:$ra, arm_bl_target:$func),
+ 4, IIC_Br,
+ []>,
+ Requires<[IsARM]>, Sched<[WriteBr]>;
}
let isBranch = 1, isTerminator = 1 in {
@@ -3727,6 +3755,23 @@ let DecoderMethod = "DecodeQADDInstruction" in
[(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))]>;
}
+def : ARMV5TEPat<(saddsat GPR:$a, GPR:$b),
+ (QADD GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(ssubsat GPR:$a, GPR:$b),
+ (QSUB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(saddsat (saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+ (QDADD rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV5TEPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
+ (QDSUB rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn),
+ (QADD8 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn),
+ (QSUB8 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn),
+ (QADD16 rGPR:$Rm, rGPR:$Rn)>;
+def : ARMV6Pat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn),
+ (QSUB16 rGPR:$Rm, rGPR:$Rn)>;
+
def UQADD16 : AAIIntrinsic<0b01100110, 0b11110001, "uqadd16", int_arm_uqadd16>;
def UQADD8 : AAIIntrinsic<0b01100110, 0b11111001, "uqadd8", int_arm_uqadd8>;
def UQSUB16 : AAIIntrinsic<0b01100110, 0b11110111, "uqsub16", int_arm_uqsub16>;
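Note on the hunk above: the ARMV5TEPat/ARMV6Pat definitions wire the generic saturating-arithmetic nodes straight onto the existing QADD/QSUB family, and the QDADD/QDSUB forms rely on the doubling identity sat(2*Rm) == saddsat(Rm, Rm). Since saddsat is a commutative node, the swapped operand order is normally covered automatically by the pattern generator; written out by hand, that variant would look like the following (illustration only, not an addition this patch makes):

    // Commuted QDADD shape: Rn + sat(2*Rm) with the outer operands swapped.
    def : ARMV5TEPat<(saddsat rGPR:$Rn, (saddsat rGPR:$Rm, rGPR:$Rm)),
                     (QDADD rGPR:$Rm, rGPR:$Rn)>;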
@@ -4870,14 +4915,13 @@ def SB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "sb", "", []>,
let hasSideEffects = 1;
}
-let usesCustomInserter = 1, Defs = [CPSR] in {
-
-// Pseudo instruction that combines movs + predicated rsbmi
-// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
+ // Pseudo instruction that combines movs + predicated rsbmi
+ // to implement integer ABS
def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
}
-let usesCustomInserter = 1, Defs = [CPSR] in {
+let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
def COPY_STRUCT_BYVAL_I32 : PseudoInst<
(outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
NoItinerary,
@@ -5085,8 +5129,8 @@ def SWPB: AIswp<1, (outs GPRnopc:$Rt),
def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
- [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
Requires<[IsARM,PreV8]> {
bits<4> opc1;
bits<4> CRn;
@@ -5109,8 +5153,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
- [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
Requires<[IsARM,PreV8]> {
let Inst{31-28} = 0b1111;
bits<4> opc1;
@@ -5289,15 +5333,15 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
}
}
-defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
-defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
-defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm STC : LdStCop <0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
} // DecoderNamespace = "CoProc"
@@ -5333,8 +5377,8 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
(outs),
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
- [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
ComplexDeprecationPredicate<"MCR">;
def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
@@ -5347,8 +5391,8 @@ def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
-def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
- (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : ARMPat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2),
+ (MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
class MovRCopro2<string opc, bit direction, dag oops, dag iops,
list<dag> pattern>
@@ -5379,8 +5423,8 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
(outs),
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
- [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
Requires<[IsARM,PreV8]>;
def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
(MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
@@ -5394,9 +5438,9 @@ def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
(MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0)>;
-def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
- imm:$CRm, imm:$opc2),
- (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : ARMV5TPat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn,
+ timm:$CRm, timm:$opc2),
+ (MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag>
pattern = []>
@@ -5422,8 +5466,8 @@ class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag>
def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
(outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, c_imm:$CRm),
- [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt,
- GPRnopc:$Rt2, imm:$CRm)]>;
+ [(int_arm_mcrr timm:$cop, timm:$opc1, GPRnopc:$Rt,
+ GPRnopc:$Rt2, timm:$CRm)]>;
def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */,
(outs GPRnopc:$Rt, GPRnopc:$Rt2),
(ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
@@ -5455,8 +5499,8 @@ class MovRRCopro2<string opc, bit direction, dag oops, dag iops,
def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
(outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, c_imm:$CRm),
- [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
- GPRnopc:$Rt2, imm:$CRm)]>;
+ [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPRnopc:$Rt,
+ GPRnopc:$Rt2, timm:$CRm)]>;
def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */,
(outs GPRnopc:$Rt, GPRnopc:$Rt2),
@@ -5579,12 +5623,12 @@ def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn),
def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
-let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
+let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP], hasNoSchedulingInfo = 1 in
def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>;
def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK,
[SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
-let usesCustomInserter = 1, Defs = [CPSR] in
+let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in
def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary,
[(win__dbzchk tGPR:$divisor)]>;
@@ -6131,10 +6175,10 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
ComplexDeprecationPredicate<"IT">;
-let mayLoad = 1, mayStore =1, hasSideEffects = 1 in
+let mayLoad = 1, mayStore =1, hasSideEffects = 1, hasNoSchedulingInfo = 1 in
def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
NoItinerary,
- [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>;
+ [(set GPR:$Rd, (int_arm_space timm:$size, GPR:$Rn))]>;
//===----------------------------------
// Atomic cmpxchg for -O0
@@ -6174,4 +6218,5 @@ def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary,
let hasSideEffects = 1;
let Size = 0;
let AsmString = "@ COMPILER BARRIER";
+ let hasNoSchedulingInfo = 1;
}
diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td
index 3e7ae55c7fc8..4f67cd6e47cc 100644
--- a/lib/Target/ARM/ARMInstrMVE.td
+++ b/lib/Target/ARM/ARMInstrMVE.td
@@ -160,7 +160,8 @@ class TMemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass {
let RenderMethod = "addMemImmOffsetOperands";
}
-class taddrmode_imm7<int shift> : MemOperand {
+class taddrmode_imm7<int shift> : MemOperand,
+ ComplexPattern<i32, 2, "SelectTAddrModeImm7<"#shift#">", []> {
let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand<shift>;
// They are printed the same way as the T2 imm8 version
let PrintMethod = "printT2AddrModeImm8Operand<false>";
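Note on the hunk above: deriving taddrmode_imm7 from ComplexPattern as well as MemOperand lets one record serve both the assembler hooks and DAG instruction selection, where SelectTAddrModeImm7<shift> decomposes the address into the two operands (base register and scaled immediate offset) that a pattern consumes. A rough sketch with a placeholder instruction name (MVE_VLDR_placeholder is not a real record):

    // Sketch only: the ComplexPattern fills in the base/offset operands.
    def : Pat<(v8i16 (load taddrmode_imm7<1>:$addr)),
              (MVE_VLDR_placeholder taddrmode_imm7<1>:$addr)>;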
@@ -221,7 +222,9 @@ def t2am_imm7shift0OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<0>;
def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>;
def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>;
-class t2am_imm7_offset<int shift> : MemOperand {
+class t2am_imm7_offset<int shift> : MemOperand,
+ ComplexPattern<i32, 1, "SelectT2AddrModeImm7Offset<"#shift#">",
+ [], [SDNPWantRoot]> {
// They are printed the same way as the imm8 version
let PrintMethod = "printT2AddrModeImm8OffsetOperand";
let ParserMatchClass =
@@ -371,6 +374,8 @@ class MVE_ScalarShiftSRegReg<string iname, bits<2> op5_4, list<dag> pattern=[]>
let Inst{7-6} = 0b00;
let Inst{5-4} = op5_4{1-0};
let Inst{3-0} = 0b1101;
+
+ let Unpredictable{8-6} = 0b111;
}
def MVE_SQRSHR : MVE_ScalarShiftSRegReg<"sqrshr", 0b10>;
@@ -403,18 +408,17 @@ class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16,
let Inst{3-0} = 0b1111;
}
-class MVE_ScalarShiftDRegReg<string iname, bit op5, bit op16,
- list<dag> pattern=[]>
+class MVE_ScalarShiftDRegRegBase<string iname, dag iops, string asm,
+ bit op5, bit op16, list<dag> pattern=[]>
: MVE_ScalarShiftDoubleReg<
- iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm),
- "$RdaLo, $RdaHi, $Rm", "@earlyclobber $RdaHi,@earlyclobber $RdaLo,"
- "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
+ iname, iops, asm, "@earlyclobber $RdaHi,@earlyclobber $RdaLo,"
+ "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
pattern> {
bits<4> Rm;
let Inst{16} = op16;
let Inst{15-12} = Rm{3-0};
- let Inst{7-6} = 0b00;
+ let Inst{6} = 0b0;
let Inst{5} = op5;
let Inst{4} = 0b0;
let Inst{3-0} = 0b1101;
@@ -427,27 +431,44 @@ class MVE_ScalarShiftDRegReg<string iname, bit op5, bit op16,
let DecoderMethod = "DecodeMVEOverlappingLongShift";
}
-def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
+class MVE_ScalarShiftDRegReg<string iname, bit op5, list<dag> pattern=[]>
+ : MVE_ScalarShiftDRegRegBase<
+ iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm),
+ "$RdaLo, $RdaHi, $Rm", op5, 0b0, pattern> {
+
+ let Inst{7} = 0b0;
+}
+
+class MVE_ScalarShiftDRegRegWithSat<string iname, bit op5, list<dag> pattern=[]>
+ : MVE_ScalarShiftDRegRegBase<
+ iname, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, rGPR:$Rm, saturateop:$sat),
+ "$RdaLo, $RdaHi, $sat, $Rm", op5, 0b1, pattern> {
+ bit sat;
+
+ let Inst{7} = sat;
+}
+
+def MVE_ASRLr : MVE_ScalarShiftDRegReg<"asrl", 0b1, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMasrl tGPREven:$RdaLo_src,
tGPROdd:$RdaHi_src, rGPR:$Rm))]>;
def MVE_ASRLi : MVE_ScalarShiftDRegImm<"asrl", 0b10, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMasrl tGPREven:$RdaLo_src,
- tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>;
-def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
+ tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>;
+def MVE_LSLLr : MVE_ScalarShiftDRegReg<"lsll", 0b0, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMlsll tGPREven:$RdaLo_src,
tGPROdd:$RdaHi_src, rGPR:$Rm))]>;
def MVE_LSLLi : MVE_ScalarShiftDRegImm<"lsll", 0b00, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMlsll tGPREven:$RdaLo_src,
- tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>;
+ tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>;
def MVE_LSRL : MVE_ScalarShiftDRegImm<"lsrl", 0b01, ?, [(set tGPREven:$RdaLo, tGPROdd:$RdaHi,
(ARMlsrl tGPREven:$RdaLo_src,
- tGPROdd:$RdaHi_src, (i32 imm:$imm)))]>;
+ tGPROdd:$RdaHi_src, (i32 long_shift:$imm)))]>;
-def MVE_SQRSHRL : MVE_ScalarShiftDRegReg<"sqrshrl", 0b1, 0b1>;
+def MVE_SQRSHRL : MVE_ScalarShiftDRegRegWithSat<"sqrshrl", 0b1>;
def MVE_SQSHLL : MVE_ScalarShiftDRegImm<"sqshll", 0b11, 0b1>;
def MVE_SRSHRL : MVE_ScalarShiftDRegImm<"srshrl", 0b10, 0b1>;
-def MVE_UQRSHLL : MVE_ScalarShiftDRegReg<"uqrshll", 0b0, 0b1>;
+def MVE_UQRSHLL : MVE_ScalarShiftDRegRegWithSat<"uqrshll", 0b0>;
def MVE_UQSHLL : MVE_ScalarShiftDRegImm<"uqshll", 0b00, 0b1>;
def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>;
@@ -531,6 +552,19 @@ defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>;
defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>;
+ def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>;
+ def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))),
+ (i32 (MVE_VADDVu32acc $src2, $src1))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))),
+ (i32 (MVE_VADDVu16acc $src2, $src1))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))),
+ (i32 (MVE_VADDVu8acc $src2, $src1))>;
+
+}
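The accumulating VADDV patterns above put the scalar operand into the tied accumulator slot, so add(vecreduce_add(v), s) selects to the acc form with the operands swapped. Because add is a commutative node, the mirrored shape is normally matched as well; spelled out by hand for the v4i32 case (illustration only, not part of this change):

    def : Pat<(i32 (add (i32 tGPR:$src2), (i32 (vecreduce_add (v4i32 MQPR:$src1))))),
              (i32 (MVE_VADDVu32acc $src2, $src1))>;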
+
class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
@@ -636,6 +670,35 @@ multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 0b1>;
defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0b0>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),
+ (i32 (MVE_VMAXVs8 (t2MVNi (i32 127)), $src))>;
+ def : Pat<(i32 (vecreduce_smax (v8i16 MQPR:$src))),
+ (i32 (MVE_VMAXVs16 (t2MOVi32imm (i32 -32768)), $src))>;
+ def : Pat<(i32 (vecreduce_smax (v4i32 MQPR:$src))),
+ (i32 (MVE_VMAXVs32 (t2MOVi (i32 -2147483648)), $src))>;
+ def : Pat<(i32 (vecreduce_umax (v16i8 MQPR:$src))),
+ (i32 (MVE_VMAXVu8 (t2MOVi (i32 0)), $src))>;
+ def : Pat<(i32 (vecreduce_umax (v8i16 MQPR:$src))),
+ (i32 (MVE_VMAXVu16 (t2MOVi (i32 0)), $src))>;
+ def : Pat<(i32 (vecreduce_umax (v4i32 MQPR:$src))),
+ (i32 (MVE_VMAXVu32 (t2MOVi (i32 0)), $src))>;
+
+ def : Pat<(i32 (vecreduce_smin (v16i8 MQPR:$src))),
+ (i32 (MVE_VMINVs8 (t2MOVi (i32 127)), $src))>;
+ def : Pat<(i32 (vecreduce_smin (v8i16 MQPR:$src))),
+ (i32 (MVE_VMINVs16 (t2MOVi16 (i32 32767)), $src))>;
+ def : Pat<(i32 (vecreduce_smin (v4i32 MQPR:$src))),
+ (i32 (MVE_VMINVs32 (t2MVNi (i32 -2147483648)), $src))>;
+ def : Pat<(i32 (vecreduce_umin (v16i8 MQPR:$src))),
+ (i32 (MVE_VMINVu8 (t2MOVi (i32 255)), $src))>;
+ def : Pat<(i32 (vecreduce_umin (v8i16 MQPR:$src))),
+ (i32 (MVE_VMINVu16 (t2MOVi16 (i32 65535)), $src))>;
+ def : Pat<(i32 (vecreduce_umin (v4i32 MQPR:$src))),
+ (i32 (MVE_VMINVu32 (t2MOVi (i32 4294967295)), $src))>;
+
+}
+
multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>;
def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>;
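Note on the reduction patterns above: each vecreduce_smax/umax/smin/umin pattern seeds the scalar input of VMAXV/VMINV with the identity element of the reduction, so the seed can never win against a real lane value. Worked out for one case (commentary, not code from the patch):

    // v8i16 signed max: the seed is -32768, materialised with t2MOVi32imm,
    // and max(x, -32768) == x for every i16 x, so
    //     vecreduce_smax(v) == VMAXVs16(seed = -32768, v)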
@@ -667,57 +730,57 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
let Inst{0} = bit_0;
}
-multiclass MVE_VMLAMLSDAV_X<string iname, string suffix, dag iops, string cstr,
- bit sz, bit bit_28, bit A, bit bit_8, bit bit_0,
- list<dag> pattern=[]> {
- def _noexch : MVE_VMLAMLSDAV<iname, suffix, iops, cstr, sz,
- bit_28, A, 0b0, bit_8, bit_0, pattern>;
- def _exch : MVE_VMLAMLSDAV<iname # "x", suffix, iops, cstr, sz,
- bit_28, A, 0b1, bit_8, bit_0, pattern>;
+multiclass MVE_VMLAMLSDAV_A<string iname, string x, string suffix,
+ bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
+ list<dag> pattern=[]> {
+ def ""#x#suffix : MVE_VMLAMLSDAV<iname # x, suffix,
+ (ins MQPR:$Qn, MQPR:$Qm), "",
+ sz, bit_28, 0b0, X, bit_8, bit_0, pattern>;
+ def "a"#x#suffix : MVE_VMLAMLSDAV<iname # "a" # x, suffix,
+ (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
+ "$RdaDest = $RdaSrc",
+ sz, bit_28, 0b1, X, bit_8, bit_0, pattern>;
+}
+
+multiclass MVE_VMLAMLSDAV_AX<string iname, string suffix, bit sz, bit bit_28,
+ bit bit_8, bit bit_0, list<dag> pattern=[]> {
+ defm "" : MVE_VMLAMLSDAV_A<iname, "", suffix, sz, bit_28,
+ 0b0, bit_8, bit_0, pattern>;
+ defm "" : MVE_VMLAMLSDAV_A<iname, "x", suffix, sz, bit_28,
+ 0b1, bit_8, bit_0, pattern>;
}
-multiclass MVE_VMLAMLSDAV_XA<string iname, string suffix, bit sz, bit bit_28,
- bit bit_8, bit bit_0, list<dag> pattern=[]> {
- defm _noacc : MVE_VMLAMLSDAV_X<iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
- sz, bit_28, 0b0, bit_8, bit_0, pattern>;
- defm _acc : MVE_VMLAMLSDAV_X<iname # "a", suffix,
- (ins tGPREven:$RdaSrc, MQPR:$Qn, MQPR:$Qm),
- "$RdaDest = $RdaSrc",
- sz, bit_28, 0b1, bit_8, bit_0, pattern>;
+multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit bit_8,
+ list<dag> pattern=[]> {
+ defm "" : MVE_VMLAMLSDAV_AX<"vmladav", "s"#suffix,
+ sz, 0b0, bit_8, 0b0, pattern>;
+ defm "" : MVE_VMLAMLSDAV_A<"vmladav", "", "u"#suffix,
+ sz, 0b1, 0b0, bit_8, 0b0, pattern>;
}
-multiclass MVE_VMLADAV_multi<string suffix, bit sz, bit U, bit bit_8,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_XA<"vmladav", suffix, sz, U, bit_8, 0b0, pattern>;
+multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28,
+ list<dag> pattern=[]> {
+ defm "" : MVE_VMLAMLSDAV_AX<"vmlsdav", "s"#suffix,
+ sz, bit_28, 0b0, 0b1, pattern>;
}
-defm MVE_VMLADAVs16 : MVE_VMLADAV_multi<"s16", 0b0, 0b0, 0b0>;
-defm MVE_VMLADAVs32 : MVE_VMLADAV_multi<"s32", 0b1, 0b0, 0b0>;
-defm MVE_VMLADAVu16 : MVE_VMLADAV_multi<"u16", 0b0, 0b1, 0b0>;
-defm MVE_VMLADAVu32 : MVE_VMLADAV_multi<"u32", 0b1, 0b1, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi< "8", 0b0, 0b1>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<"16", 0b0, 0b0>;
+defm MVE_VMLADAV : MVE_VMLADAV_multi<"32", 0b1, 0b0>;
-defm MVE_VMLADAVs8 : MVE_VMLADAV_multi<"s8", 0b0, 0b0, 0b1>;
-defm MVE_VMLADAVu8 : MVE_VMLADAV_multi<"u8", 0b0, 0b1, 0b1>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi< "8", 0b0, 0b1>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"16", 0b0, 0b0>;
+defm MVE_VMLSDAV : MVE_VMLSDAV_multi<"32", 0b1, 0b0>;
// vmlav aliases vmladav
-foreach acc = ["_acc", "_noacc"] in {
+foreach acc = ["", "a"] in {
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in {
- def : MVEInstAlias<!strconcat("vmlav", !if(!eq(acc, "_acc"), "a", ""),
- "${vp}.", suffix, "\t$RdaDest, $Qn, $Qm"),
- (!cast<Instruction>("MVE_VMLADAV"#suffix#acc#"_noexch")
+ def : MVEInstAlias<"vmlav"#acc#"${vp}."#suffix#"\t$RdaDest, $Qn, $Qm",
+ (!cast<Instruction>("MVE_VMLADAV"#acc#suffix)
tGPREven:$RdaDest, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
}
}
-multiclass MVE_VMLSDAV_multi<string suffix, bit sz, bit bit_28,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLAMLSDAV_XA<"vmlsdav", suffix, sz, bit_28, 0b0, 0b1, pattern>;
-}
-
-defm MVE_VMLSDAVs8 : MVE_VMLSDAV_multi<"s8", 0, 0b1>;
-defm MVE_VMLSDAVs16 : MVE_VMLSDAV_multi<"s16", 0, 0b0>;
-defm MVE_VMLSDAVs32 : MVE_VMLSDAV_multi<"s32", 1, 0b0>;
-
// Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH
class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0,
@@ -742,82 +805,83 @@ class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
let Inst{0} = bit_0;
}
-multiclass MVE_VMLALDAVBase_X<string iname, string suffix, dag iops,
- string cstr, bit sz, bit bit_28, bit A,
- bit bit_8, bit bit_0, list<dag> pattern=[]> {
- def _noexch : MVE_VMLALDAVBase<iname, suffix, iops, cstr, sz,
- bit_28, A, 0b0, bit_8, bit_0, pattern>;
- def _exch : MVE_VMLALDAVBase<iname # "x", suffix, iops, cstr, sz,
- bit_28, A, 0b1, bit_8, bit_0, pattern>;
+multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix,
+ bit sz, bit bit_28, bit X, bit bit_8, bit bit_0,
+ list<dag> pattern=[]> {
+ def ""#x#suffix : MVE_VMLALDAVBase<
+ iname # x, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
+ sz, bit_28, 0b0, X, bit_8, bit_0, pattern>;
+ def "a"#x#suffix : MVE_VMLALDAVBase<
+ iname # "a" # x, suffix,
+ (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, MQPR:$Qn, MQPR:$Qm),
+ "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc",
+ sz, bit_28, 0b1, X, bit_8, bit_0, pattern>;
}
-multiclass MVE_VMLALDAVBase_XA<string iname, string suffix, bit sz, bit bit_28,
- bit bit_8, bit bit_0, list<dag> pattern=[]> {
- defm _noacc : MVE_VMLALDAVBase_X<
- iname, suffix, (ins MQPR:$Qn, MQPR:$Qm), "",
- sz, bit_28, 0b0, bit_8, bit_0, pattern>;
- defm _acc : MVE_VMLALDAVBase_X<
- iname # "a", suffix, (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc,
- MQPR:$Qn, MQPR:$Qm),
- "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc",
- sz, bit_28, 0b1, bit_8, bit_0, pattern>;
+
+multiclass MVE_VMLALDAVBase_AX<string iname, string suffix, bit sz, bit bit_28,
+ bit bit_8, bit bit_0, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_A<iname, "", suffix, sz,
+ bit_28, 0b0, bit_8, bit_0, pattern>;
+ defm "" : MVE_VMLALDAVBase_A<iname, "x", suffix, sz,
+ bit_28, 0b1, bit_8, bit_0, pattern>;
}
-multiclass MVE_VRMLALDAVH_multi<string suffix, bit U, list<dag> pattern=[]> {
- defm "" : MVE_VMLALDAVBase_XA<
- "vrmlaldavh", suffix, 0b0, U, 0b1, 0b0, pattern>;
+multiclass MVE_VRMLALDAVH_multi<string suffix, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#suffix,
+ 0b0, 0b0, 0b1, 0b0, pattern>;
+ defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#suffix,
+ 0b0, 0b1, 0b0, 0b1, 0b0, pattern>;
}
-defm MVE_VRMLALDAVHs32 : MVE_VRMLALDAVH_multi<"s32", 0>;
-defm MVE_VRMLALDAVHu32 : MVE_VRMLALDAVH_multi<"u32", 1>;
+defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<"32">;
// vrmlalvh aliases for vrmlaldavh
def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm",
- (MVE_VRMLALDAVHs32_noacc_noexch
+ (MVE_VRMLALDAVHs32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
def : MVEInstAlias<"vrmlalvha${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm",
- (MVE_VRMLALDAVHs32_acc_noexch
+ (MVE_VRMLALDAVHas32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
def : MVEInstAlias<"vrmlalvh${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm",
- (MVE_VRMLALDAVHu32_noacc_noexch
+ (MVE_VRMLALDAVHu32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm",
- (MVE_VRMLALDAVHu32_acc_noexch
+ (MVE_VRMLALDAVHau32
tGPREven:$RdaLo, tGPROdd:$RdaHi,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
-multiclass MVE_VMLALDAV_multi<string suffix, bit sz, bit U,
- list<dag> pattern=[]> {
- defm "" : MVE_VMLALDAVBase_XA<"vmlaldav", suffix, sz, U, 0b0, 0b0, pattern>;
+multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#suffix, sz, 0b0, 0b0, 0b0, pattern>;
+ defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#suffix,
+ sz, 0b1, 0b0, 0b0, 0b0, pattern>;
}
-defm MVE_VMLALDAVs16 : MVE_VMLALDAV_multi<"s16", 0b0, 0b0>;
-defm MVE_VMLALDAVs32 : MVE_VMLALDAV_multi<"s32", 0b1, 0b0>;
-defm MVE_VMLALDAVu16 : MVE_VMLALDAV_multi<"u16", 0b0, 0b1>;
-defm MVE_VMLALDAVu32 : MVE_VMLALDAV_multi<"u32", 0b1, 0b1>;
+defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>;
+defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>;
// vmlalv aliases vmlaldav
-foreach acc = ["_acc", "_noacc"] in {
+foreach acc = ["", "a"] in {
foreach suffix = ["s16", "s32", "u16", "u32"] in {
- def : MVEInstAlias<!strconcat("vmlalv", !if(!eq(acc, "_acc"), "a", ""),
- "${vp}.", suffix, "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm"),
- (!cast<Instruction>("MVE_VMLALDAV"#suffix#acc#"_noexch")
+ def : MVEInstAlias<"vmlalv" # acc # "${vp}." # suffix #
+ "\t$RdaLoDest, $RdaHiDest, $Qn, $Qm",
+ (!cast<Instruction>("MVE_VMLALDAV"#acc#suffix)
tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest,
MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
}
}
multiclass MVE_VMLSLDAV_multi<string iname, string suffix, bit sz,
- bit bit_28, list<dag> pattern=[]> {
- defm "" : MVE_VMLALDAVBase_XA<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>;
+ bit bit_28, list<dag> pattern=[]> {
+ defm "" : MVE_VMLALDAVBase_AX<iname, suffix, sz, bit_28, 0b0, 0b1, pattern>;
}
-defm MVE_VMLSLDAVs16 : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>;
-defm MVE_VMLSLDAVs32 : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>;
-defm MVE_VRMLSLDAVHs32 : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>;
+defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>;
+defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>;
+defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>;
// end of mve_rDest instructions
@@ -967,11 +1031,12 @@ def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
let Inst{6} = 0b1;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
-class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7>
+class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7, string cstr="">
: MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname,
- suffix, "$Qd, $Qm", ""> {
+ suffix, "$Qd, $Qm", cstr> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
@@ -985,9 +1050,9 @@ class MVE_VREV<string iname, string suffix, bits<2> size, bits<2> bit_8_7>
let Inst{0} = 0b0;
}
-def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00>;
-def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00>;
-def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00>;
+def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, "@earlyclobber $Qd">;
+def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, "@earlyclobber $Qd">;
+def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, "@earlyclobber $Qd">;
def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>;
def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>;
@@ -995,6 +1060,13 @@ def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>;
def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>;
let Predicates = [HasMVEInt] in {
+ def : Pat<(v8i16 (bswap (v8i16 MQPR:$src))),
+ (v8i16 (MVE_VREV16_8 (v8i16 MQPR:$src)))>;
+ def : Pat<(v4i32 (bswap (v4i32 MQPR:$src))),
+ (v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>;
+}
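These two patterns are valid because byte-swapping every lane is exactly a byte reversal within each 16-bit or 32-bit container, which is what vrev16.8 and vrev32.8 perform. A quick worked check (commentary only):

    // v8i16 lane 0xAABB --bswap--> 0xBBAA; vrev16.8 swaps the two bytes of
    // each halfword and yields the same 0xBBAA. Likewise vrev32.8 reverses
    // the four bytes of each word, matching bswap on v4i32 lanes.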
+
+let Predicates = [HasMVEInt] in {
def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))),
(v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>;
def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))),
@@ -1026,6 +1098,7 @@ def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
let Inst{12-6} = 0b0010111;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
let Predicates = [HasMVEInt] in {
@@ -1054,6 +1127,7 @@ class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28>
let Inst{6} = 0b1;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VEOR : MVE_bit_ops<"veor", 0b00, 0b1>;
@@ -1145,6 +1219,7 @@ class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps>
class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type>
: MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
let Inst{5} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>;
@@ -1173,6 +1248,7 @@ def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm",
class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type>
: MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
let Inst{5} = 0b1;
+ let validForTailPredication = 1;
}
def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>;
@@ -1315,8 +1391,12 @@ let Predicates = [HasMVEInt] in {
def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane),
(MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>;
- def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane),
- (COPY_TO_REGCLASS (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>;
+ def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
+ (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
+ def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
+ (COPY_TO_REGCLASS
+ (VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))),
+ HPR)>;
def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
@@ -1408,6 +1488,7 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract,
let Inst{12-8} = 0b01000;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
class MVE_VADD<string suffix, bits<2> size, list<dag> pattern=[]>
@@ -1442,8 +1523,8 @@ let Predicates = [HasMVEInt] in {
}
class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
- bits<2> size, list<dag> pattern=[]>
- : MVE_int<iname, suffix, size, pattern> {
+ bits<2> size, ValueType vt>
+ : MVE_int<iname, suffix, size, []> {
let Inst{28} = U;
let Inst{25-23} = 0b110;
@@ -1453,26 +1534,49 @@ class MVE_VQADDSUB<string iname, string suffix, bit U, bit subtract,
let Inst{8} = 0b0;
let Inst{4} = 0b1;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
+
+ ValueType VT = vt;
}
-class MVE_VQADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
- : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, pattern>;
-class MVE_VQSUB<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
- : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, pattern>;
+class MVE_VQADD<string suffix, bit U, bits<2> size, ValueType VT>
+ : MVE_VQADDSUB<"vqadd", suffix, U, 0b0, size, VT>;
+class MVE_VQSUB<string suffix, bit U, bits<2> size, ValueType VT>
+ : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size, VT>;
-def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00>;
-def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01>;
-def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10>;
-def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00>;
-def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01>;
-def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10>;
+def MVE_VQADDs8 : MVE_VQADD<"s8", 0b0, 0b00, v16i8>;
+def MVE_VQADDs16 : MVE_VQADD<"s16", 0b0, 0b01, v8i16>;
+def MVE_VQADDs32 : MVE_VQADD<"s32", 0b0, 0b10, v4i32>;
+def MVE_VQADDu8 : MVE_VQADD<"u8", 0b1, 0b00, v16i8>;
+def MVE_VQADDu16 : MVE_VQADD<"u16", 0b1, 0b01, v8i16>;
+def MVE_VQADDu32 : MVE_VQADD<"u32", 0b1, 0b10, v4i32>;
+
+def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00, v16i8>;
+def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01, v8i16>;
+def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10, v4i32>;
+def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00, v16i8>;
+def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01, v8i16>;
+def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10, v4i32>;
+
+let Predicates = [HasMVEInt] in {
+ foreach instr = [MVE_VQADDu8, MVE_VQADDu16, MVE_VQADDu32] in
+ foreach VT = [instr.VT] in
+ def : Pat<(VT (uaddsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
+ (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
+ foreach instr = [MVE_VQADDs8, MVE_VQADDs16, MVE_VQADDs32] in
+ foreach VT = [instr.VT] in
+ def : Pat<(VT (saddsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
+ (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
+ foreach instr = [MVE_VQSUBu8, MVE_VQSUBu16, MVE_VQSUBu32] in
+ foreach VT = [instr.VT] in
+ def : Pat<(VT (usubsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
+ (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
+ foreach instr = [MVE_VQSUBs8, MVE_VQSUBs16, MVE_VQSUBs32] in
+ foreach VT = [instr.VT] in
+ def : Pat<(VT (ssubsat (VT MQPR:$Qm), (VT MQPR:$Qn))),
+ (VT (instr (VT MQPR:$Qm), (VT MQPR:$Qn)))>;
+}
-def MVE_VQSUBs8 : MVE_VQSUB<"s8", 0b0, 0b00>;
-def MVE_VQSUBs16 : MVE_VQSUB<"s16", 0b0, 0b01>;
-def MVE_VQSUBs32 : MVE_VQSUB<"s32", 0b0, 0b10>;
-def MVE_VQSUBu8 : MVE_VQSUB<"u8", 0b1, 0b00>;
-def MVE_VQSUBu16 : MVE_VQSUB<"u16", 0b1, 0b01>;
-def MVE_VQSUBu32 : MVE_VQSUB<"u32", 0b1, 0b10>;
class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
: MVE_int<"vabd", suffix, size, pattern> {
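Note on the hunk above: recording the ValueType on each MVE_VQADDSUB instruction and looping over it with the nested foreach saves writing one Pat per element type by hand. For reference, the first iteration of the uaddsat loop expands to roughly this (MVE_VQADDu8 carries VT = v16i8):

    def : Pat<(v16i8 (uaddsat (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn))),
              (v16i8 (MVE_VQADDu8 (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)))>;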
@@ -1483,6 +1587,7 @@ class MVE_VABD_int<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let Inst{12-8} = 0b00111;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>;
@@ -1501,6 +1606,7 @@ class MVE_VRHADD<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let Inst{12-8} = 0b00001;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VRHADDs8 : MVE_VRHADD<"s8", 0b0, 0b00>;
@@ -1522,6 +1628,7 @@ class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
let Inst{8} = 0b0;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
class MVE_VHADD<string suffix, bit U, bits<2> size,
@@ -1545,6 +1652,60 @@ def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>;
def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>;
def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (ARMvshrsImm
+ (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+ (v16i8 (MVE_VHADDs8
+ (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+ def : Pat<(v8i16 (ARMvshrsImm
+ (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+ (v8i16 (MVE_VHADDs16
+ (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+ def : Pat<(v4i32 (ARMvshrsImm
+ (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+ (v4i32 (MVE_VHADDs32
+ (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+
+ def : Pat<(v16i8 (ARMvshruImm
+ (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+ (v16i8 (MVE_VHADDu8
+ (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+ def : Pat<(v8i16 (ARMvshruImm
+ (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+ (v8i16 (MVE_VHADDu16
+ (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+ def : Pat<(v4i32 (ARMvshruImm
+ (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+ (v4i32 (MVE_VHADDu32
+ (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+
+ def : Pat<(v16i8 (ARMvshrsImm
+ (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+ (v16i8 (MVE_VHSUBs8
+ (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+ def : Pat<(v8i16 (ARMvshrsImm
+ (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+ (v8i16 (MVE_VHSUBs16
+ (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+ def : Pat<(v4i32 (ARMvshrsImm
+ (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+ (v4i32 (MVE_VHSUBs32
+ (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+
+ def : Pat<(v16i8 (ARMvshruImm
+ (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+ (v16i8 (MVE_VHSUBu8
+ (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+ def : Pat<(v8i16 (ARMvshruImm
+ (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+ (v8i16 (MVE_VHSUBu16
+ (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+ def : Pat<(v4i32 (ARMvshruImm
+ (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+ (v4i32 (MVE_VHSUBu32
+ (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+}
+
class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
"vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> {
@@ -1563,6 +1724,7 @@ class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
let Inst{6} = 0b0;
let Inst{5} = E;
let Inst{4-0} = 0b10000;
+ let validForTailPredication = 1;
}
def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>;
@@ -1625,6 +1787,7 @@ class MVE_VCLSCLZ<string iname, string suffix, bits<2> size,
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>;
@@ -1635,6 +1798,15 @@ def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>;
def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>;
def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))),
+ (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>;
+ def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))),
+ (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>;
+ def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))),
+ (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>;
+}
+
class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
list<dag> pattern=[]>
: MVEIntSingleSrc<iname, suffix, size, pattern> {
@@ -1648,6 +1820,7 @@ class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>;
@@ -1689,6 +1862,7 @@ class MVE_VQABSNEG<string iname, string suffix, bits<2> size,
let Inst{6} = 0b1;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>;
@@ -1720,6 +1894,7 @@ class MVE_mod_imm<string iname, string suffix, bits<4> cmode, bit op,
let Inst{3-0} = imm{3-0};
let DecoderMethod = "DecodeMVEModImmInstruction";
+ let validForTailPredication = 1;
}
let isReMaterializable = 1 in {
@@ -2115,6 +2290,7 @@ class MVE_shift_by_vec<string iname, string suffix, bit U,
let Inst{4} = bit_4;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
multiclass mve_shift_by_vec_multi<string iname, bit bit_4, bit bit_8> {
@@ -2163,6 +2339,7 @@ class MVE_shift_with_imm<string iname, string suffix, dag oops, dag iops,
let Inst{4} = 0b1;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
}
class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm>
@@ -2175,6 +2352,7 @@ class MVE_VSxI_imm<string iname, string suffix, bit bit_8, dag imm>
let Inst{21-16} = imm;
let Inst{10-9} = 0b10;
let Inst{8} = bit_8;
+ let validForTailPredication = 1;
}
def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, (ins shr_imm8:$imm)> {
@@ -2427,6 +2605,7 @@ class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size,
let Inst{11-10} = 0b01;
let Inst{9-7} = op{2-0};
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
@@ -2489,6 +2668,7 @@ class MVE_VMUL_fp<string suffix, bit size, list<dag> pattern=[]>
let Inst{12-8} = 0b01101;
let Inst{7} = Qn{3};
let Inst{4} = 0b1;
+ let validForTailPredication = 1;
}
def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>;
@@ -2556,8 +2736,38 @@ def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
(ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>;
-def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>;
+let Predicates = [HasMVEFloat, UseFusedMAC] in {
+ def : Pat<(v8f16 (fadd (v8f16 MQPR:$src1),
+ (fmul (v8f16 MQPR:$src2),
+ (v8f16 MQPR:$src3)))),
+ (v8f16 (MVE_VFMAf16 $src1, $src2, $src3))>;
+ def : Pat<(v4f32 (fadd (v4f32 MQPR:$src1),
+ (fmul (v4f32 MQPR:$src2),
+ (v4f32 MQPR:$src3)))),
+ (v4f32 (MVE_VFMAf32 $src1, $src2, $src3))>;
+
+ def : Pat<(v8f16 (fsub (v8f16 MQPR:$src1),
+ (fmul (v8f16 MQPR:$src2),
+ (v8f16 MQPR:$src3)))),
+ (v8f16 (MVE_VFMSf16 $src1, $src2, $src3))>;
+ def : Pat<(v4f32 (fsub (v4f32 MQPR:$src1),
+ (fmul (v4f32 MQPR:$src2),
+ (v4f32 MQPR:$src3)))),
+ (v4f32 (MVE_VFMSf32 $src1, $src2, $src3))>;
+}
+
+let Predicates = [HasMVEFloat] in {
+ def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
+ (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
+ def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
+ (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
+}
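The fma patterns above rotate the operands because the VFMA instructions accumulate into their tied first operand: fma($src1, $src2, $src3) computes $src1 * $src2 + $src3, and passing $src3 as the accumulator gives the same value. As a worked check of the operand mapping (commentary only):

    // (MVE_VFMAf32 $src3, $src1, $src2) computes Qd = Qd_src + Qn * Qm
    //                                               = $src3 + $src1 * $src2
    //                                               = fma($src1, $src2, $src3)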
+
+
+let validForTailPredication = 1 in {
+ def MVE_VADDf32 : MVE_VADDSUBFMA_fp<"vadd", "f32", 0b0, 0b0, 0b1, 0b0>;
+ def MVE_VADDf16 : MVE_VADDSUBFMA_fp<"vadd", "f16", 0b1, 0b0, 0b1, 0b0>;
+}
let Predicates = [HasMVEFloat] in {
def : Pat<(v4f32 (fadd (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
@@ -2566,8 +2776,11 @@ let Predicates = [HasMVEFloat] in {
(v8f16 (MVE_VADDf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
}
-def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>;
-def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>;
+
+let validForTailPredication = 1 in {
+ def MVE_VSUBf32 : MVE_VADDSUBFMA_fp<"vsub", "f32", 0b0, 0b0, 0b1, 0b1>;
+ def MVE_VSUBf16 : MVE_VADDSUBFMA_fp<"vsub", "f16", 0b1, 0b0, 0b1, 0b1>;
+}
let Predicates = [HasMVEFloat] in {
def : Pat<(v4f32 (fsub (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))),
@@ -2576,10 +2789,10 @@ let Predicates = [HasMVEFloat] in {
(v8f16 (MVE_VSUBf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>;
}
-class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]>
+class MVE_VCADD<string suffix, bit size, string cstr="", list<dag> pattern=[]>
: MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> {
bits<4> Qd;
bits<4> Qn;
bit rot;
@@ -2598,7 +2811,7 @@ class MVE_VCADD<string suffix, bit size, list<dag> pattern=[]>
}
def MVE_VCADDf16 : MVE_VCADD<"f16", 0b0>;
-def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1>;
+def MVE_VCADDf32 : MVE_VCADD<"f32", 0b1, "@earlyclobber $Qd">;
class MVE_VABD_fp<string suffix, bit size>
: MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm),
@@ -2617,6 +2830,7 @@ class MVE_VABD_fp<string suffix, bit size>
let Inst{11-8} = 0b1101;
let Inst{7} = Qn{3};
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>;
@@ -2643,6 +2857,7 @@ class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
let Inst{4} = 0b1;
let DecoderMethod = "DecodeMVEVCVTt1fp";
+ let validForTailPredication = 1;
}
class MVE_VCVT_imm_asmop<int Bits> : AsmOperandClass {
@@ -2693,6 +2908,7 @@ class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
let Inst{9-8} = rm;
let Inst{7} = op;
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op,
@@ -2727,6 +2943,7 @@ class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op,
let Inst{12-9} = 0b0011;
let Inst{8-7} = op;
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
// The unsuffixed VCVT for float->int implicitly rounds toward zero,
@@ -2776,6 +2993,7 @@ class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
let Inst{11-8} = 0b0111;
let Inst{7} = negate;
let Inst{4} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>;
@@ -2863,6 +3081,7 @@ class MVE_VCMPqq<string suffix, bit bit_28, bits<2> bits_21_20,
// decoder to emit an operand that isn't affected by any instruction
// bit.
let DecoderMethod = "DecodeMVEVCMP<false," # predtype.DecoderMethod # ">";
+ let validForTailPredication = 1;
}
class MVE_VCMPqqf<string suffix, bit size>
@@ -2927,6 +3146,7 @@ class MVE_VCMPqr<string suffix, bit bit_28, bits<2> bits_21_20,
let Constraints = "";
// Custom decoder method, for the same reason as MVE_VCMPqq
let DecoderMethod = "DecodeMVEVCMP<true," # predtype.DecoderMethod # ">";
+ let validForTailPredication = 1;
}
class MVE_VCMPqrf<string suffix, bit size>
@@ -2966,6 +3186,168 @@ def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>;
def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>;
def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>;
+multiclass unpred_vcmp_z<string suffix, int fc> {
+ def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>;
+ def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>;
+ def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>;
+
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+}
+
+multiclass unpred_vcmp_r<string suffix, int fc> {
+ def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>;
+ def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>;
+ def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>;
+
+ def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>;
+ def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>;
+ def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>;
+
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, 1, VCCR:$p1))>;
+
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>;
+}
+
+multiclass unpred_vcmpf_z<int fc> {
+ def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>;
+ def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>;
+
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))))),
+            (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>;
+}
+
+multiclass unpred_vcmpf_r<int fc> {
+ def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))),
+ (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
+ def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))),
+ (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
+
+ def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>;
+ def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>;
+
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))))),
+ (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))))),
+ (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, 1, VCCR:$p1))>;
+
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, 1, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, 1, VCCR:$p1))>;
+}
+
+let Predicates = [HasMVEInt] in {
+ defm MVE_VCEQZ : unpred_vcmp_z<"i", 0>;
+ defm MVE_VCNEZ : unpred_vcmp_z<"i", 1>;
+ defm MVE_VCGEZ : unpred_vcmp_z<"s", 10>;
+ defm MVE_VCLTZ : unpred_vcmp_z<"s", 11>;
+ defm MVE_VCGTZ : unpred_vcmp_z<"s", 12>;
+ defm MVE_VCLEZ : unpred_vcmp_z<"s", 13>;
+ defm MVE_VCGTUZ : unpred_vcmp_z<"u", 8>;
+ defm MVE_VCGEUZ : unpred_vcmp_z<"u", 2>;
+
+ defm MVE_VCEQ : unpred_vcmp_r<"i", 0>;
+ defm MVE_VCNE : unpred_vcmp_r<"i", 1>;
+ defm MVE_VCGE : unpred_vcmp_r<"s", 10>;
+ defm MVE_VCLT : unpred_vcmp_r<"s", 11>;
+ defm MVE_VCGT : unpred_vcmp_r<"s", 12>;
+ defm MVE_VCLE : unpred_vcmp_r<"s", 13>;
+ defm MVE_VCGTU : unpred_vcmp_r<"u", 8>;
+ defm MVE_VCGEU : unpred_vcmp_r<"u", 2>;
+}
+
+let Predicates = [HasMVEFloat] in {
+ defm MVE_VFCEQZ : unpred_vcmpf_z<0>;
+ defm MVE_VFCNEZ : unpred_vcmpf_z<1>;
+ defm MVE_VFCGEZ : unpred_vcmpf_z<10>;
+ defm MVE_VFCLTZ : unpred_vcmpf_z<11>;
+ defm MVE_VFCGTZ : unpred_vcmpf_z<12>;
+ defm MVE_VFCLEZ : unpred_vcmpf_z<13>;
+
+ defm MVE_VFCEQ : unpred_vcmpf_r<0>;
+ defm MVE_VFCNE : unpred_vcmpf_r<1>;
+ defm MVE_VFCGE : unpred_vcmpf_r<10>;
+ defm MVE_VFCLT : unpred_vcmpf_r<11>;
+ defm MVE_VFCGT : unpred_vcmpf_r<12>;
+ defm MVE_VFCLE : unpred_vcmpf_r<13>;
+}
+
+
+// Extra "worst case" and/or/xor patterns, going into and out of GPR
+multiclass two_predops<SDPatternOperator opnode, Instruction insn> {
+ def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))),
+ (v16i1 (COPY_TO_REGCLASS
+ (insn (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p1), rGPR)),
+ (i32 (COPY_TO_REGCLASS (v16i1 VCCR:$p2), rGPR))),
+ VCCR))>;
+ def v8i1 : Pat<(v8i1 (opnode (v8i1 VCCR:$p1), (v8i1 VCCR:$p2))),
+ (v8i1 (COPY_TO_REGCLASS
+ (insn (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p1), rGPR)),
+ (i32 (COPY_TO_REGCLASS (v8i1 VCCR:$p2), rGPR))),
+ VCCR))>;
+ def v4i1 : Pat<(v4i1 (opnode (v4i1 VCCR:$p1), (v4i1 VCCR:$p2))),
+ (v4i1 (COPY_TO_REGCLASS
+ (insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)),
+ (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))),
+ VCCR))>;
+}
+
+let Predicates = [HasMVEInt] in {
+ defm POR : two_predops<or, t2ORRrr>;
+ defm PAND : two_predops<and, t2ANDrr>;
+ defm PEOR : two_predops<xor, t2EORrr>;
+}
+
+// Occasionally we need to cast between an i32 and a boolean vector, for
+// example when moving between rGPR and VPR.P0 as part of predicate vector
+// shuffles. We also sometimes need to cast between different predicate
+// vector types (v4i1<>v8i1, etc.) as part of lowering vector shuffles.
+
+def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
+
+let Predicates = [HasMVEInt] in {
+ foreach VT = [ v4i1, v8i1, v16i1 ] in {
+ def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
+ (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>;
+ def : Pat<(VT (predicate_cast (i32 VCCR:$src))),
+ (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>;
+
+ foreach VT2 = [ v4i1, v8i1, v16i1 ] in
+ def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),
+ (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
+ }
+}
+
// end of MVE compares
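Note on the compare block above: the defm lines rely on the multiclasses concatenating record names from the suffix, so defm MVE_VCEQZ : unpred_vcmp_z<"i", 0> yields MVE_VCEQZi8/i16/i32, and the !cast calls resolve to the existing MVE_VCMP*r instructions. The i8 member expands to roughly the following (expansion sketch, not an extra definition):

    def MVE_VCEQZi8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 0))),
                          (v16i1 (MVE_VCMPi8r (v16i8 MQPR:$v1), ZR, 0))>;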
// start of MVE_qDest_qSrc
@@ -2989,10 +3371,10 @@ class MVE_qDest_qSrc<string iname, string suffix, dag oops, dag iops,
}
class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract,
- string suffix, bits<2> size, list<dag> pattern=[]>
+ string suffix, bits<2> size, string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_n, "$Qd = $Qd_src", pattern> {
+ vpred_n, "$Qd = $Qd_src"#cstr, pattern> {
bits<4> Qn;
let Inst{28} = subtract;
@@ -3009,7 +3391,7 @@ multiclass MVE_VQxDMLxDH_multi<string iname, bit exch,
bit round, bit subtract> {
def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>;
def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>;
- def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10>;
+ def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10, ",@earlyclobber $Qd">;
}
defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>;
@@ -3021,10 +3403,10 @@ defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>;
defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>;
defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>;
-class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]>
+class MVE_VCMUL<string iname, string suffix, bit size, string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, "", pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> {
bits<4> Qn;
bits<2> rot;
@@ -3041,13 +3423,13 @@ class MVE_VCMUL<string iname, string suffix, bit size, list<dag> pattern=[]>
}
def MVE_VCMULf16 : MVE_VCMUL<"vcmul", "f16", 0b0>;
-def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1>;
+def MVE_VCMULf32 : MVE_VCMUL<"vcmul", "f32", 0b1, "@earlyclobber $Qd">;
class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
- bit T, list<dag> pattern=[]>
+ bit T, string cstr, list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_r, "", pattern> {
+ vpred_r, cstr, pattern> {
bits<4> Qd;
bits<4> Qn;
bits<4> Qm;
@@ -3063,9 +3445,9 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
}
multiclass MVE_VMULL_multi<string iname, string suffix,
- bit bit_28, bits<2> bits_21_20> {
- def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0>;
- def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1>;
+ bit bit_28, bits<2> bits_21_20, string cstr=""> {
+ def bh : MVE_VMULL<iname # "b", suffix, bit_28, bits_21_20, 0b0, cstr>;
+ def th : MVE_VMULL<iname # "t", suffix, bit_28, bits_21_20, 0b1, cstr>;
}
// For integer multiplies, bits 21:20 encode size, and bit 28 signedness.
@@ -3074,10 +3456,10 @@ multiclass MVE_VMULL_multi<string iname, string suffix,
defm MVE_VMULLs8 : MVE_VMULL_multi<"vmull", "s8", 0b0, 0b00>;
defm MVE_VMULLs16 : MVE_VMULL_multi<"vmull", "s16", 0b0, 0b01>;
-defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10>;
+defm MVE_VMULLs32 : MVE_VMULL_multi<"vmull", "s32", 0b0, 0b10, "@earlyclobber $Qd">;
defm MVE_VMULLu8 : MVE_VMULL_multi<"vmull", "u8", 0b1, 0b00>;
defm MVE_VMULLu16 : MVE_VMULL_multi<"vmull", "u16", 0b1, 0b01>;
-defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10>;
+defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10, "@earlyclobber $Qd">;
defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>;
defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>;
@@ -3144,6 +3526,18 @@ defm MVE_VQMOVNu32 : MVE_VxMOVxN_halves<"vqmovn", "u32", 0b1, 0b1, 0b01>;
defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>;
defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>;
+def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
+ (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+ def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
+ (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))),
+ (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
+ (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+}
+
class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
@@ -3166,11 +3560,10 @@ defm MVE_VCVTf16f32 : MVE_VCVT_ff_halves<"f16.f32", 0b0>;
defm MVE_VCVTf32f16 : MVE_VCVT_ff_halves<"f32.f16", 0b1>;
class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
- list<dag> pattern=[]>
+ string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot),
- "$Qd, $Qn, $Qm, $rot", vpred_r, "",
- pattern> {
+ "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, pattern> {
bits<4> Qn;
bit rot;
@@ -3186,11 +3579,11 @@ class MVE_VxCADD<string iname, string suffix, bits<2> size, bit halve,
def MVE_VCADDi8 : MVE_VxCADD<"vcadd", "i8", 0b00, 0b1>;
def MVE_VCADDi16 : MVE_VxCADD<"vcadd", "i16", 0b01, 0b1>;
-def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1>;
+def MVE_VCADDi32 : MVE_VxCADD<"vcadd", "i32", 0b10, 0b1, "@earlyclobber $Qd">;
def MVE_VHCADDs8 : MVE_VxCADD<"vhcadd", "s8", 0b00, 0b0>;
def MVE_VHCADDs16 : MVE_VxCADD<"vhcadd", "s16", 0b01, 0b0>;
-def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0>;
+def MVE_VHCADDs32 : MVE_VxCADD<"vhcadd", "s32", 0b10, 0b0, "@earlyclobber $Qd">;
class MVE_VADCSBC<string iname, bit I, bit subtract,
dag carryin, list<dag> pattern=[]>
@@ -3220,10 +3613,10 @@ def MVE_VSBC : MVE_VADCSBC<"vsbc", 0b0, 0b1, (ins cl_FPSCR_NZCV:$carryin)>;
def MVE_VSBCI : MVE_VADCSBC<"vsbci", 0b1, 0b1, (ins)>;
class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
- list<dag> pattern=[]>
+ string cstr="", list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
(ins MQPR:$Qn, MQPR:$Qm), "$Qd, $Qn, $Qm",
- vpred_r, "", pattern> {
+ vpred_r, cstr, pattern> {
bits<4> Qn;
let Inst{28} = size;
@@ -3236,13 +3629,13 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
let Inst{0} = 0b1;
}
-multiclass MVE_VQDMULL_halves<string suffix, bit size> {
- def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0>;
- def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1>;
+multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> {
+ def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>;
+ def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>;
}
defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>;
-defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1>;
+defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">;
// end of mve_qDest_qSrc
@@ -3267,9 +3660,9 @@ class MVE_qr_base<dag oops, dag iops, InstrItinClass itin, string iname,
let Inst{3-0} = Rm{3-0};
}
-class MVE_qDest_rSrc<string iname, string suffix, list<dag> pattern=[]>
+class MVE_qDest_rSrc<string iname, string suffix, string cstr="", list<dag> pattern=[]>
: MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm),
- NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, "",
+ NoItinerary, iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr,
pattern>;
class MVE_qDestSrc_rSrc<string iname, string suffix, list<dag> pattern=[]>
@@ -3291,7 +3684,7 @@ class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]>
class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
bit bit_5, bit bit_12, bit bit_16,
bit bit_28, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = size;
@@ -3299,6 +3692,7 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
let Inst{12} = bit_12;
let Inst{8} = 0b1;
let Inst{5} = bit_5;
+ let validForTailPredication = 1;
}
multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix,
@@ -3320,9 +3714,27 @@ defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>;
defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>;
defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
+ (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
+ (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
+ (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
+}
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
+ (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
+ (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
+ (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
+}
+
class MVE_VQDMULL_qr<string iname, string suffix, bit size,
- bit T, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ bit T, string cstr="", list<dag> pattern=[]>
+ : MVE_qDest_rSrc<iname, suffix, cstr, pattern> {
let Inst{28} = size;
let Inst{21-20} = 0b11;
@@ -3332,18 +3744,18 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size,
let Inst{5} = 0b1;
}
-multiclass MVE_VQDMULL_qr_halves<string suffix, bit size> {
- def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0>;
- def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1>;
+multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> {
+ def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>;
+ def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>;
}
defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>;
-defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1>;
+defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">;
class MVE_VxADDSUB_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit subtract,
list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = bits_21_20;
@@ -3351,6 +3763,7 @@ class MVE_VxADDSUB_qr<string iname, string suffix,
let Inst{12} = subtract;
let Inst{8} = 0b1;
let Inst{5} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>;
@@ -3388,6 +3801,7 @@ class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
let Inst{12-8} = 0b11110;
let Inst{7} = bit_7;
let Inst{6-4} = 0b110;
+ let validForTailPredication = 1;
}
multiclass MVE_VxSHL_qr_types<string iname, bit bit_7, bit bit_17> {
@@ -3421,7 +3835,7 @@ let Predicates = [HasMVEInt] in {
}
class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = 0b1;
let Inst{21-20} = size;
@@ -3429,15 +3843,27 @@ class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
let Inst{12} = 0b1;
let Inst{8} = 0b0;
let Inst{5} = 0b1;
+ let validForTailPredication = 1;
}
def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>;
def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>;
def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))),
+ (v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>;
+
+ def : Pat<(v4i32 ( bitreverse (v4i32 MQPR:$val1))),
+ (v4i32 ( MVE_VBRSR32 (v4i32 MQPR:$val1), (t2MOVi (i32 32)) ))>;
+
+ def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))),
+ (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>;
+}
+
class MVE_VMUL_qr_int<string iname, string suffix,
bits<2> size, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = 0b0;
let Inst{21-20} = size;
@@ -3445,15 +3871,25 @@ class MVE_VMUL_qr_int<string iname, string suffix,
let Inst{12} = 0b1;
let Inst{8} = 0b0;
let Inst{5} = 0b1;
+ let validForTailPredication = 1;
}
def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>;
def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>;
def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
+ (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
+ (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
+ def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
+ (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
+}
+
class MVE_VxxMUL_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, pattern> {
+ : MVE_qDest_rSrc<iname, suffix, "", pattern> {
let Inst{28} = bit_28;
let Inst{21-20} = bits_21_20;
@@ -3471,14 +3907,14 @@ def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>;
def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>;
def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>;
-let Predicates = [HasMVEFloat] in {
+let Predicates = [HasMVEFloat], validForTailPredication = 1 in {
def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>;
def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>;
}
class MVE_VFMAMLA_qr<string iname, string suffix,
- bit bit_28, bits<2> bits_21_20, bit S,
- list<dag> pattern=[]>
+ bit bit_28, bits<2> bits_21_20, bit S,
+ list<dag> pattern=[]>
: MVE_qDestSrc_rSrc<iname, suffix, pattern> {
let Inst{28} = bit_28;
@@ -3487,6 +3923,7 @@ class MVE_VFMAMLA_qr<string iname, string suffix,
let Inst{12} = S;
let Inst{8} = 0b0;
let Inst{5} = 0b0;
+ let validForTailPredication = 1;
}
def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>;
@@ -3503,6 +3940,21 @@ def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>;
def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>;
def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>;
+let Predicates = [HasMVEInt] in {
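+ // An add of a vector and a (vector * duplicated-scalar) product selects VMLA,
+ // keeping the scalar operand in a GPR.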
+ def : Pat<(v4i32 (add (v4i32 MQPR:$src1),
+ (v4i32 (mul (v4i32 MQPR:$src2),
+ (v4i32 (ARMvdup (i32 rGPR:$x))))))),
+ (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>;
+ def : Pat<(v8i16 (add (v8i16 MQPR:$src1),
+ (v8i16 (mul (v8i16 MQPR:$src2),
+ (v8i16 (ARMvdup (i32 rGPR:$x))))))),
+ (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>;
+ def : Pat<(v16i8 (add (v16i8 MQPR:$src1),
+ (v16i8 (mul (v16i8 MQPR:$src2),
+ (v16i8 (ARMvdup (i32 rGPR:$x))))))),
+ (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>;
+}
+
let Predicates = [HasMVEFloat] in {
def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>;
def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>;
@@ -3555,6 +4007,7 @@ class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12,
let Inst{7} = imm{1};
let Inst{6-1} = 0b110111;
let Inst{0} = imm{0};
+ let validForTailPredication = 1;
}
def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>;
@@ -3589,6 +4042,7 @@ class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12,
let Inst{6-4} = 0b110;
let Inst{3-1} = Rm{3-1};
let Inst{0} = imm{0};
+ let validForTailPredication = 1;
}
def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>;
@@ -3599,6 +4053,7 @@ def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>;
def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>;
def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
+let hasSideEffects = 1 in
class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
: MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
"$Rn", vpred_n, "", pattern> {
@@ -3614,6 +4069,7 @@ class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
let Constraints = "";
let DecoderMethod = "DecodeMveVCTP";
+ let validForTailPredication = 1;
}
def MVE_VCTP8 : MVE_VCTP<"8", 0b00>;
@@ -3621,6 +4077,15 @@ def MVE_VCTP16 : MVE_VCTP<"16", 0b01>;
def MVE_VCTP32 : MVE_VCTP<"32", 0b10>;
def MVE_VCTP64 : MVE_VCTP<"64", 0b11>;
+let Predicates = [HasMVEInt] in {
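+ // The llvm.arm.vctp8/16/32 intrinsics map directly onto VCTP, producing a
+ // v16i1/v8i1/v4i1 lane predicate.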
+ def : Pat<(int_arm_vctp8 rGPR:$Rn),
+ (v16i1 (MVE_VCTP8 rGPR:$Rn))>;
+ def : Pat<(int_arm_vctp16 rGPR:$Rn),
+ (v8i1 (MVE_VCTP16 rGPR:$Rn))>;
+ def : Pat<(int_arm_vctp32 rGPR:$Rn),
+ (v4i1 (MVE_VCTP32 rGPR:$Rn))>;
+}
+
// end of mve_qDest_rSrc
// start of coproc mov
@@ -3863,6 +4328,7 @@ class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc,
let mayLoad = dir.load;
let mayStore = !eq(dir.load,0);
+ let validForTailPredication = 1;
}
// Contiguous load and store instructions. These come in two main
@@ -4165,7 +4631,8 @@ class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> patte
let Inst{7} = fc{0};
let Inst{4} = 0b0;
- let Defs = [VPR, P0];
+ let Defs = [VPR];
+ let validForTailPredication = 1;
}
class MVE_VPTt1<string suffix, bits<2> size, dag iops>
@@ -4177,11 +4644,12 @@ class MVE_VPTt1<string suffix, bits<2> size, dag iops>
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = fc{1};
+ let validForTailPredication = 1;
}
class MVE_VPTt1i<string suffix, bits<2> size>
: MVE_VPTt1<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, MQPR:$Qm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_i:$fc)> {
let Inst{12} = 0b0;
let Inst{0} = 0b0;
}
@@ -4192,7 +4660,7 @@ def MVE_VPTv16i8 : MVE_VPTt1i<"i8", 0b00>;
class MVE_VPTt1u<string suffix, bits<2> size>
: MVE_VPTt1<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, MQPR:$Qm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_u:$fc)> {
let Inst{12} = 0b0;
let Inst{0} = 0b1;
}
@@ -4203,7 +4671,7 @@ def MVE_VPTv16u8 : MVE_VPTt1u<"u8", 0b00>;
class MVE_VPTt1s<string suffix, bits<2> size>
: MVE_VPTt1<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, MQPR:$Qm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_s:$fc)> {
let Inst{12} = 0b1;
}
@@ -4225,7 +4693,7 @@ class MVE_VPTt2<string suffix, bits<2> size, dag iops>
class MVE_VPTt2i<string suffix, bits<2> size>
: MVE_VPTt2<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_i:$fc, MQPR:$Qn, GPRwithZR:$Rm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_i:$fc)> {
let Inst{12} = 0b0;
let Inst{5} = 0b0;
}
@@ -4236,7 +4704,7 @@ def MVE_VPTv16i8r : MVE_VPTt2i<"i8", 0b00>;
class MVE_VPTt2u<string suffix, bits<2> size>
: MVE_VPTt2<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_u:$fc, MQPR:$Qn, GPRwithZR:$Rm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_u:$fc)> {
let Inst{12} = 0b0;
let Inst{5} = 0b1;
}
@@ -4247,7 +4715,7 @@ def MVE_VPTv16u8r : MVE_VPTt2u<"u8", 0b00>;
class MVE_VPTt2s<string suffix, bits<2> size>
: MVE_VPTt2<suffix, size,
- (ins vpt_mask:$Mk, pred_basic_s:$fc, MQPR:$Qn, GPRwithZR:$Rm)> {
+ (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_s:$fc)> {
let Inst{12} = 0b1;
}
@@ -4276,12 +4744,13 @@ class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern=
let Inst{7} = fc{0};
let Inst{4} = 0b0;
- let Defs = [P0];
+ let Defs = [VPR];
let Predicates = [HasMVEFloat];
+ let validForTailPredication = 1;
}
class MVE_VPTft1<string suffix, bit size>
- : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, MQPR:$Qm),
+ : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, MQPR:$Qm, pred_basic_fp:$fc),
"$fc, $Qn, $Qm"> {
bits<3> fc;
bits<4> Qm;
@@ -4296,7 +4765,7 @@ def MVE_VPTv4f32 : MVE_VPTft1<"f32", 0b0>;
def MVE_VPTv8f16 : MVE_VPTft1<"f16", 0b1>;
class MVE_VPTft2<string suffix, bit size>
- : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, pred_basic_fp:$fc, MQPR:$Qn, GPRwithZR:$Rm),
+ : MVE_VPTf<suffix, size, (ins vpt_mask:$Mk, MQPR:$Qn, GPRwithZR:$Rm, pred_basic_fp:$fc),
"$fc, $Qn, $Rm"> {
bits<3> fc;
bits<4> Rm;
@@ -4322,7 +4791,8 @@ def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary,
let Unpredictable{7} = 0b1;
let Unpredictable{5} = 0b1;
- let Defs = [P0];
+ let Uses = [VPR];
+ let validForTailPredication = 1;
}
def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
@@ -4346,6 +4816,7 @@ def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary,
let Inst{4} = 0b0;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
+ let validForTailPredication = 1;
}
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32",
@@ -4353,19 +4824,113 @@ foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32",
def : MVEInstAlias<"vpsel${vp}." # suffix # "\t$Qd, $Qn, $Qm",
(MVE_VPSEL MQPR:$Qd, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>;
-def MVE_VPNOT : MVE_p<(outs), (ins), NoItinerary,
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
+ (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
+ (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
+ (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+
+ def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
+ (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+ def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
+ (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>;
+
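+ // A vselect whose predicate is a plain vector rather than a VCCR value is
+ // first compared not-equal against zero to form the VPR predicate.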
+ def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))),
+ (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>;
+ def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))),
+ (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>;
+ def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))),
+ (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>;
+
+ def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))),
+ (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>;
+ def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))),
+ (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0,
+ (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>;
+
+ // Pred <-> Int
+ def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))),
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
+ def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))),
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
+ def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))),
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
+
+ def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))),
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
+ def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))),
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
+ def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))),
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
+
+ def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))),
+ (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>;
+ def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))),
+ (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>;
+ def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))),
+ (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>;
+
+ def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))),
+ (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))),
+ (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v4i1 (trunc (v4i32 MQPR:$v1))),
+ (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, 1))>;
+}
+
+let Predicates = [HasMVEFloat] in {
+ // Pred <-> Float
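+ // (The operands below are VMOV immediate encodings: 112 = 0x70 and 240 = 0xf0
+ // give +/-1.0 as f32; 2620 = 0xa3c and 2748 = 0xabc expand to the f16 bit
+ // patterns 0x3c00 and 0xbc00, i.e. +/-1.0.)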
+ // 112 is 1.0 in float
+ def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))),
+ (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>;
+ // 2620 is 1.0 in half
+ def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))),
+ (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>;
+ // 240 is -1.0 in float
+ def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))),
+ (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>;
+ // 2748 is -1.0 in half
+ def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))),
+ (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>;
+
+ def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>;
+ def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>;
+}
+
+def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary,
"vpnot", "", "", vpred_n, "", []> {
let Inst{31-0} = 0b11111110001100010000111101001101;
let Unpredictable{19-17} = 0b111;
let Unpredictable{12} = 0b1;
let Unpredictable{7} = 0b1;
let Unpredictable{5} = 0b1;
- let Defs = [P0];
- let Uses = [P0];
let Constraints = "";
+ let DecoderMethod = "DecodeMVEVPNOT";
}
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))),
+ (v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>;
+ def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))),
+ (v8i1 (MVE_VPNOT (v8i1 VCCR:$pred)))>;
+ def : Pat<(v16i1 (xor (v16i1 VCCR:$pred), (v16i1 (predicate_cast (i32 65535))))),
+ (v16i1 (MVE_VPNOT (v16i1 VCCR:$pred)))>;
+}
+
+
class MVE_loltp_start<dag iops, string asm, string ops, bits<2> size>
: t2LOL<(outs GPRlr:$LR), iops, asm, ops> {
bits<4> Rn;
@@ -4433,159 +4998,440 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> {
// Patterns
//===----------------------------------------------------------------------===//
-class MVE_unpred_vector_store_typed<ValueType Ty, Instruction RegImmInst,
+class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
+ (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
+class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
+ (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>;
+
+multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
+ int shift> {
+ def : MVE_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
+}
+
+class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag LoadKind, int shift>
+ : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
+ (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
+class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
+ PatFrag LoadKind, int shift>
+ : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
+ (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>;
+
+multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
+ int shift> {
+ def : MVE_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
+ def : MVE_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
+}
+
+class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode,
PatFrag StoreKind, int shift>
- : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
- (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
+ : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr),
+ (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>;
-multiclass MVE_unpred_vector_store<Instruction RegImmInst, PatFrag StoreKind,
+multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind,
int shift> {
- def : MVE_unpred_vector_store_typed<v16i8, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v8i16, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v8f16, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
- def : MVE_unpred_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
-}
-
-class MVE_unpred_vector_load_typed<ValueType Ty, Instruction RegImmInst,
- PatFrag LoadKind, int shift>
- : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
- (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
-
-multiclass MVE_unpred_vector_load<Instruction RegImmInst, PatFrag LoadKind,
- int shift> {
- def : MVE_unpred_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v8i16, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v8f16, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
- def : MVE_unpred_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
-}
+ def : MVE_vector_offset_store_typed<v16i8, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v8i16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v8f16, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v4i32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v4f32, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v2i64, RegImmInst, StoreKind, shift>;
+ def : MVE_vector_offset_store_typed<v2f64, RegImmInst, StoreKind, shift>;
+}
+
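+// Pre/post-indexed store fragments that additionally require the natural
+// element alignment (4 bytes for word, 2 for halfword stores), so that the
+// writeback VSTRW/VSTRH forms below can be selected; byte stores need no
+// alignment check.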
+def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
+ (pre_store node:$val, node:$ptr, node:$offset), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned32_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
+ (post_store node:$val, node:$ptr, node:$offset), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned16_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
+ (pre_store node:$val, node:$ptr, node:$offset), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
+ (post_store node:$val, node:$ptr, node:$offset), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
+
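+// PatFrags classifying masked loads and stores by memory element size,
+// extension kind and alignment; they are used to select the predicated
+// VLDR/VSTR forms further down.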
+def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ return Ld->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
+}]>;
+def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
+}]>;
+def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2;
+}]>;
+def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
+}]>;
+def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
+}]>;
+def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4;
+}]>;
+
+def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (maskedstore8 node:$val, node:$ptr, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, node:$pred), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
+}]>;
+
+def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (maskedstore16 node:$val, node:$ptr, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, node:$pred), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
+}]>;
let Predicates = [HasMVEInt, IsLE] in {
- defm : MVE_unpred_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
- defm : MVE_unpred_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>;
- defm : MVE_unpred_vector_store<MVE_VSTRWU32, alignedstore32, 2>;
+ // Stores
+ defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
+ defm : MVE_vector_store<MVE_VSTRHU16, hword_alignedstore, 1>;
+ defm : MVE_vector_store<MVE_VSTRWU32, alignedstore32, 2>;
- defm : MVE_unpred_vector_load<MVE_VLDRBU8, byte_alignedload, 0>;
- defm : MVE_unpred_vector_load<MVE_VLDRHU16, hword_alignedload, 1>;
- defm : MVE_unpred_vector_load<MVE_VLDRWU32, alignedload32, 2>;
+ // Loads
+ defm : MVE_vector_load<MVE_VLDRBU8, byte_alignedload, 0>;
+ defm : MVE_vector_load<MVE_VLDRHU16, hword_alignedload, 1>;
+ defm : MVE_vector_load<MVE_VLDRWU32, alignedload32, 2>;
- def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
- (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
- def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
- (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
- def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
- (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+ // Pre/post inc stores
+ defm : MVE_vector_offset_store<MVE_VSTRBU8_pre, pre_store, 0>;
+ defm : MVE_vector_offset_store<MVE_VSTRBU8_post, post_store, 0>;
+ defm : MVE_vector_offset_store<MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
+ defm : MVE_vector_offset_store<MVE_VSTRHU16_post, aligned16_post_store, 1>;
+ defm : MVE_vector_offset_store<MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
+ defm : MVE_vector_offset_store<MVE_VSTRWU32_post, aligned32_post_store, 2>;
}
let Predicates = [HasMVEInt, IsBE] in {
- def : MVE_unpred_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>;
- def : MVE_unpred_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>;
- def : MVE_unpred_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>;
- def : MVE_unpred_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>;
- def : MVE_unpred_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>;
-
- def : MVE_unpred_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>;
- def : MVE_unpred_vector_load_typed<v8i16, MVE_VLDRHU16, alignedload16, 1>;
- def : MVE_unpred_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>;
- def : MVE_unpred_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>;
- def : MVE_unpred_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>;
+ // Aligned Stores
+ def : MVE_vector_store_typed<v16i8, MVE_VSTRBU8, store, 0>;
+ def : MVE_vector_store_typed<v8i16, MVE_VSTRHU16, alignedstore16, 1>;
+ def : MVE_vector_store_typed<v8f16, MVE_VSTRHU16, alignedstore16, 1>;
+ def : MVE_vector_store_typed<v4i32, MVE_VSTRWU32, alignedstore32, 2>;
+ def : MVE_vector_store_typed<v4f32, MVE_VSTRWU32, alignedstore32, 2>;
+
+ // Aligned Loads
+ def : MVE_vector_load_typed<v16i8, MVE_VLDRBU8, load, 0>;
+ def : MVE_vector_load_typed<v8i16, MVE_VLDRHU16, alignedload16, 1>;
+ def : MVE_vector_load_typed<v8f16, MVE_VLDRHU16, alignedload16, 1>;
+ def : MVE_vector_load_typed<v4i32, MVE_VLDRWU32, alignedload32, 2>;
+ def : MVE_vector_load_typed<v4f32, MVE_VLDRWU32, alignedload32, 2>;
+
+ // Other unaligned loads/stores need to go through a VREV
+ def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)),
+ (v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)),
+ (v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)),
+ (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)),
+ (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)),
+ (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)),
+ (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+ def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+ def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+ (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+
+ // Pre/Post inc stores
+ def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_pre, pre_store, 0>;
+ def : MVE_vector_offset_store_typed<v16i8, MVE_VSTRBU8_post, post_store, 0>;
+ def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
+ def : MVE_vector_offset_store_typed<v8i16, MVE_VSTRHU16_post, aligned16_post_store, 1>;
+ def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_pre, aligned16_pre_store, 1>;
+ def : MVE_vector_offset_store_typed<v8f16, MVE_VSTRHU16_post, aligned16_post_store, 1>;
+ def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
+ def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
+ def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
+ def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
}
+let Predicates = [HasMVEInt] in {
+ // Aligned masked stores, shared between LE and BE
+ def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore8, 0>;
+ def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, maskedstore16, 1>;
+ def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, maskedstore16, 1>;
+ def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, maskedstore32, 2>;
+ def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, maskedstore32, 2>;
+ // Truncating stores
+ def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred),
+ (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>;
+ // Aligned masked loads
+ def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload8, 0>;
+ def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+ def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
+ def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+ def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+ // Extending masked loads.
+ def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v8i16 NEONimmAllZerosV))),
+ (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v8i16 NEONimmAllZerosV))),
+ (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v8i16 NEONimmAllZerosV))),
+ (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+}
// Widening/Narrowing Loads/Stores
+let MinAlignment = 2 in {
+ def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorevi16 node:$val, node:$ptr)>;
+ def post_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
+ (post_truncstvi16 node:$val, node:$base, node:$offset)>;
+ def pre_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset),
+ (pre_truncstvi16 node:$val, node:$base, node:$offset)>;
+}
+
let Predicates = [HasMVEInt] in {
- def : Pat<(truncstorevi8 (v8i16 MQPR:$val), t2addrmode_imm7<1>:$addr),
- (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<1>:$addr)>;
- def : Pat<(truncstorevi8 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr),
- (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<1>:$addr)>;
- def : Pat<(truncstorevi16 (v4i32 MQPR:$val), t2addrmode_imm7<2>:$addr),
- (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<2>:$addr)>;
+ def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr),
+ (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>;
+ def : Pat<(truncstorevi8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr),
+ (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr)>;
+ def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr),
+ (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr)>;
+
+ def : Pat<(post_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+ (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+ def : Pat<(post_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+ (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+ def : Pat<(post_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
+ (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
+
+ def : Pat<(pre_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+ (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+ def : Pat<(pre_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr),
+ (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
+ def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
+ (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
+}
+
+
+let MinAlignment = 2 in {
+ def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>;
+ def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>;
+ def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>;
}
multiclass MVEExtLoad<string DestLanes, string DestElemBits,
string SrcElemBits, string SrcElemType,
- Operand am> {
+ string Align, Operand am> {
def _Any : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
- (!cast<PatFrag>("extloadvi" # SrcElemBits) am:$addr)),
+ (!cast<PatFrag>("extloadvi" # SrcElemBits # Align) am:$addr)),
(!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
am:$addr)>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
- (!cast<PatFrag>("zextloadvi" # SrcElemBits) am:$addr)),
+ (!cast<PatFrag>("zextloadvi" # SrcElemBits # Align) am:$addr)),
(!cast<Instruction>("MVE_VLDR" # SrcElemType # "U" # DestElemBits)
am:$addr)>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # "i" # DestElemBits)
- (!cast<PatFrag>("sextloadvi" # SrcElemBits) am:$addr)),
+ (!cast<PatFrag>("sextloadvi" # SrcElemBits # Align) am:$addr)),
(!cast<Instruction>("MVE_VLDR" # SrcElemType # "S" # DestElemBits)
am:$addr)>;
}
let Predicates = [HasMVEInt] in {
- defm : MVEExtLoad<"4", "32", "8", "B", t2addrmode_imm7<1>>;
- defm : MVEExtLoad<"8", "16", "8", "B", t2addrmode_imm7<1>>;
- defm : MVEExtLoad<"4", "32", "16", "H", t2addrmode_imm7<2>>;
+ defm : MVEExtLoad<"4", "32", "8", "B", "", taddrmode_imm7<0>>;
+ defm : MVEExtLoad<"8", "16", "8", "B", "", taddrmode_imm7<0>>;
+ defm : MVEExtLoad<"4", "32", "16", "H", "_align2", taddrmode_imm7<1>>;
}
// Bit convert patterns
let Predicates = [HasMVEInt] in {
- def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 MQPR:$src))), (v2i64 MQPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 MQPR:$src))), (v4f32 MQPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v8f16 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v8i16 MQPR:$src))), (v8f16 MQPR:$src)>;
}
let Predicates = [IsLE,HasMVEInt] in {
- def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
- def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
-
- def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
- def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
-
- def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
- def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
-
- def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
- def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
-
- def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
- def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
-
- def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
- def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
-
- def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
- def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 MQPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 MQPR:$src)>;
+
+ def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 MQPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 MQPR:$src)>;
+
+ def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 MQPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 MQPR:$src)>;
+
+ def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 MQPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 MQPR:$src)>;
+
+ def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 MQPR:$src)>;
+ def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 MQPR:$src)>;
+
+ def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 MQPR:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 MQPR:$src)>;
+
+ def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 MQPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 MQPR:$src)>;
+}
+
+let Predicates = [IsBE,HasMVEInt] in {
+ def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 (MVE_VREV64_8 MQPR:$src))>;
+
+ def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 (MVE_VREV64_8 MQPR:$src))>;
+
+ def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 (MVE_VREV32_8 MQPR:$src))>;
+
+ def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
+ def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 (MVE_VREV32_8 MQPR:$src))>;
+
+ def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 (MVE_VREV16_8 MQPR:$src))>;
+
+ def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
+ def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
+ def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 (MVE_VREV16_8 MQPR:$src))>;
+
+ def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
+ def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
}
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 806681df102c..60ca92e58041 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -15,22 +15,22 @@
// NEON-specific Operands.
//===----------------------------------------------------------------------===//
def nModImm : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
}
def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; }
def nImmSplatI8 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmSplatI8AsmOperand;
}
def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; }
def nImmSplatI16 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmSplatI16AsmOperand;
}
def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; }
def nImmSplatI32 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmSplatI32AsmOperand;
}
def nImmSplatNotI16AsmOperand : AsmOperandClass { let Name = "NEONi16splatNot"; }
@@ -43,7 +43,7 @@ def nImmSplatNotI32 : Operand<i32> {
}
def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; }
def nImmVMOVI32 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmVMOVI32AsmOperand;
}
@@ -62,18 +62,18 @@ class nImmVINVIAsmOperandReplicate<ValueType From, ValueType To>
}
class nImmVMOVIReplicate<ValueType From, ValueType To> : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmVMOVIAsmOperandReplicate<From, To>;
}
class nImmVINVIReplicate<ValueType From, ValueType To> : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmVINVIAsmOperandReplicate<From, To>;
}
def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
def nImmVMOVI32Neg : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmVMOVI32NegAsmOperand;
}
def nImmVMOVF32 : Operand<i32> {
@@ -82,7 +82,7 @@ def nImmVMOVF32 : Operand<i32> {
}
def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; }
def nImmSplatI64 : Operand<i32> {
- let PrintMethod = "printNEONModImmOperand";
+ let PrintMethod = "printVMOVModImmOperand";
let ParserMatchClass = nImmSplatI64AsmOperand;
}
@@ -478,20 +478,8 @@ def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
// NEON-specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
-def SDTARMVCMPZ : SDTypeProfile<1, 1, []>;
-
-def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
-def NEONvceqz : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>;
-def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
-def NEONvcgez : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>;
-def NEONvclez : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>;
-def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
-def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
-def NEONvcgtz : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>;
-def NEONvcltz : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
-def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
-def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+def SDTARMVTST : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVTST>;
// Types for vector shift by immediates. The "SHX" version is for long and
// narrow operations where the source and destination vectors have different
@@ -559,14 +547,14 @@ def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>;
def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{
ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
unsigned EltBits = 0;
- uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+ uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
return (EltBits == 32 && EltVal == 0);
}]>;
def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{
ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
unsigned EltBits = 0;
- uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+ uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
return (EltBits == 8 && EltVal == 0xff);
}]>;
@@ -3326,30 +3314,30 @@ class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
// source operand element sizes of 8, 16 and 32 bits:
multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
bits<5> op11_7, bit op4, string opc, string Dt,
- string asm, SDNode OpNode> {
+ string asm, int fc> {
// 64-bit vector types.
def v8i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "8"), asm, "",
- [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>;
+ [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), (i32 fc))))]>;
def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "16"), asm, "",
- [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>;
+ [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), (i32 fc))))]>;
def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "32"), asm, "",
- [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>;
+ [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), (i32 fc))))]>;
def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, "f32", asm, "",
- [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> {
+ [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), (i32 fc))))]> {
let Inst{10} = 1; // overwrite F = 1
}
def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
(outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
opc, "f16", asm, "",
- [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>,
+ [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), (i32 fc))))]>,
Requires<[HasNEON,HasFullFP16]> {
let Inst{10} = 1; // overwrite F = 1
}
@@ -3358,30 +3346,83 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "8"), asm, "",
- [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>;
+ [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), (i32 fc))))]>;
def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "16"), asm, "",
- [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>;
+ [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), (i32 fc))))]>;
def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, !strconcat(Dt, "32"), asm, "",
- [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>;
+ [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), (i32 fc))))]>;
def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, "f32", asm, "",
- [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> {
+ [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), (i32 fc))))]> {
let Inst{10} = 1; // overwrite F = 1
}
def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
(outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
opc, "f16", asm, "",
- [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>,
+ [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), (i32 fc))))]>,
Requires<[HasNEON,HasFullFP16]> {
let Inst{10} = 1; // overwrite F = 1
}
}
+// Neon 3-register comparisons.
+class N3VQ_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, int fc, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 1, op4,
+ (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set QPR:$Vd, (ResTy (ARMvcmp (OpTy QPR:$Vn), (OpTy QPR:$Vm), (i32 fc))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable;
+}
+
+class N3VD_cmp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+ InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, int fc, bit Commutable>
+ : N3V<op24, op23, op21_20, op11_8, 0, op4,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+ OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+ [(set DPR:$Vd, (ResTy (ARMvcmp (OpTy DPR:$Vn), (OpTy DPR:$Vm), (i32 fc))))]> {
+ // All of these have a two-operand InstAlias.
+ let TwoOperandAliasConstraint = "$Vn = $Vd";
+ let isCommutable = Commutable;
+}
+
+multiclass N3V_QHS_cmp<bit op24, bit op23, bits<4> op11_8, bit op4,
+ InstrItinClass itinD16, InstrItinClass itinD32,
+ InstrItinClass itinQ16, InstrItinClass itinQ32,
+ string OpcodeStr, string Dt,
+ int fc, bit Commutable = 0> {
+ // 64-bit vector types.
+ def v8i8 : N3VD_cmp<op24, op23, 0b00, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v8i8, v8i8, fc, Commutable>;
+ def v4i16 : N3VD_cmp<op24, op23, 0b01, op11_8, op4, itinD16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v4i16, v4i16, fc, Commutable>;
+ def v2i32 : N3VD_cmp<op24, op23, 0b10, op11_8, op4, itinD32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v2i32, v2i32, fc, Commutable>;
+
+ // 128-bit vector types.
+ def v16i8 : N3VQ_cmp<op24, op23, 0b00, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "8"),
+ v16i8, v16i8, fc, Commutable>;
+ def v8i16 : N3VQ_cmp<op24, op23, 0b01, op11_8, op4, itinQ16,
+ OpcodeStr, !strconcat(Dt, "16"),
+ v8i16, v8i16, fc, Commutable>;
+ def v4i32 : N3VQ_cmp<op24, op23, 0b10, op11_8, op4, itinQ32,
+ OpcodeStr, !strconcat(Dt, "32"),
+ v4i32, v4i32, fc, Commutable>;
+}
+
// Neon 2-register vector intrinsics,
// element sizes of 8, 16 and 32 bits:
@@ -5026,67 +5067,67 @@ def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
// Vector Comparisons.
// VCEQ : Vector Compare Equal
-defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>;
-def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
- NEONvceq, 1>;
-def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
- NEONvceq, 1>;
-def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
- NEONvceq, 1>,
+defm VCEQ : N3V_QHS_cmp<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vceq", "i", 0, 1>;
+def VCEQfd : N3VD_cmp<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
+ 0, 1>;
+def VCEQfq : N3VQ_cmp<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
+ 0, 1>;
+def VCEQhd : N3VD_cmp<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
+ 0, 1>,
Requires<[HasNEON, HasFullFP16]>;
-def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
- NEONvceq, 1>,
+def VCEQhq : N3VQ_cmp<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
+ 0, 1>,
Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in
defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
- "$Vd, $Vm, #0", NEONvceqz>;
+ "$Vd, $Vm, #0", 0>;
// VCGE : Vector Compare Greater Than or Equal
-defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>;
-defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>;
-def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
- NEONvcge, 0>;
-def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
- NEONvcge, 0>;
-def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
- NEONvcge, 0>,
+defm VCGEs : N3V_QHS_cmp<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcge", "s", 10, 0>;
+defm VCGEu : N3V_QHS_cmp<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcge", "u", 2, 0>;
+def VCGEfd : N3VD_cmp<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
+ 10, 0>;
+def VCGEfq : N3VQ_cmp<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
+ 10, 0>;
+def VCGEhd : N3VD_cmp<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
+ 10, 0>,
Requires<[HasNEON, HasFullFP16]>;
-def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
- NEONvcge, 0>,
+def VCGEhq : N3VQ_cmp<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
+ 10, 0>,
Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
- "$Vd, $Vm, #0", NEONvcgez>;
+ "$Vd, $Vm, #0", 10>;
defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
- "$Vd, $Vm, #0", NEONvclez>;
+ "$Vd, $Vm, #0", 13>;
}
// VCGT : Vector Compare Greater Than
-defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>;
-defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
- IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>;
-def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
- NEONvcgt, 0>;
-def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
- NEONvcgt, 0>;
-def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
- NEONvcgt, 0>,
+defm VCGTs : N3V_QHS_cmp<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcgt", "s", 12, 0>;
+defm VCGTu : N3V_QHS_cmp<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+ IIC_VSUBi4Q, "vcgt", "u", 8, 0>;
+def VCGTfd : N3VD_cmp<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
+ 12, 0>;
+def VCGTfq : N3VQ_cmp<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
+ 12, 0>;
+def VCGThd : N3VD_cmp<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
+ 12, 0>,
Requires<[HasNEON, HasFullFP16]>;
-def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
- NEONvcgt, 0>,
+def VCGThq : N3VQ_cmp<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
+ 12, 0>,
Requires<[HasNEON, HasFullFP16]>;
let TwoOperandAliasConstraint = "$Vm = $Vd" in {
defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
- "$Vd, $Vm, #0", NEONvcgtz>;
+ "$Vd, $Vm, #0", 12>;
defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
- "$Vd, $Vm, #0", NEONvcltz>;
+ "$Vd, $Vm, #0", 11>;
}
// VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
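
The `fc` immediates threaded through the new N2V_QHS_cmp and N3V_QHS_cmp patterns above appear to be plain ARMCC::CondCodes values: 0 for VCEQ, 10/2 for signed/unsigned VCGE, 12/8 for signed/unsigned VCGT, 13 for VCLEz and 11 for VCLTz. A minimal sketch of that assumed numbering (illustrative C++, not part of the patch):

#include <cassert>
#include <cstdint>

// Sketch only: the ARMCC::CondCodes numbering assumed by the 'fc' operands
// in the vector-compare patterns (EQ=0, HS=2, HI=8, GE=10, LT=11, GT=12, LE=13).
enum CondCodes : uint8_t { EQ = 0, NE, HS, LO, MI, PL, VS, VC,
                           HI, LS, GE, LT, GT, LE, AL };

int main() {
  assert(EQ == 0);   // VCEQ / VCEQz
  assert(HS == 2);   // VCGEu (unsigned >=)
  assert(HI == 8);   // VCGTu (unsigned >)
  assert(GE == 10);  // VCGEs / VCGEz
  assert(LT == 11);  // VCLTz
  assert(GT == 12);  // VCGTs / VCGTz
  assert(LE == 13);  // VCLEz
  return 0;
}
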
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index cfeb13c6acb6..18bcbda44580 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -565,6 +565,13 @@ let isCall = 1,
4, IIC_Br,
[(ARMcall_nolink tGPR:$func)]>,
Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
+
+ // Also used for Thumb2.
+ // Pushes lr before the call.
+ def tBL_PUSHLR : tPseudoInst<(outs), (ins GPRlr:$ra, pred:$p, thumb_bl_target:$func),
+ 4, IIC_Br,
+ []>,
+ Requires<[IsThumb]>, Sched<[WriteBr]>;
}
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
@@ -592,6 +599,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
[(ARMbrjt tGPR:$target, tjumptable:$jt)]>,
Sched<[WriteBrTbl]> {
let Size = 2;
+ let isNotDuplicable = 1;
list<Predicate> Predicates = [IsThumb, IsThumb1Only];
}
}
@@ -1362,6 +1370,12 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
[(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
+
+ def tLSLSri : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, imm0_31:$imm5),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rd, CPSR, (ARMlsls tGPR:$Rn, imm0_31:$imm5))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
}
@@ -1465,7 +1479,7 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
// Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
// and make use of the same compressed jump table format as Thumb-2.
let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1,
- isIndirectBranch = 1 in {
+ isIndirectBranch = 1, isNotDuplicable = 1 in {
def tTBB_JT : tPseudoInst<(outs),
(ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
IIC_Br, []>, Sched<[WriteBr]>;
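
The new tLSLSri pseudo models a flag-setting shift: the ARMlsls node produces the shifted value together with CPSR (N and Z from the result, C from the last bit shifted out). A small sketch of that arithmetic, assuming the usual LSLS carry rule for a non-zero immediate (not backend code):

#include <cassert>
#include <cstdint>

// What a flag-setting LSLS computes for imm5 in [1,31]: the shifted value plus
// a carry equal to the last bit shifted out (bit 32 - imm5 of the input).
// For imm5 == 0 the real instruction leaves the carry unchanged; that case is
// modelled as 'false' here purely to keep the sketch simple.
struct LslsResult { uint32_t Value; bool Carry; };

static LslsResult lsls(uint32_t Rn, unsigned Imm5) {
  LslsResult R;
  R.Value = Rn << Imm5;
  R.Carry = Imm5 ? ((Rn >> (32 - Imm5)) & 1) != 0 : false;
  return R;
}

int main() {
  assert(lsls(0x80000001u, 1).Value == 2u);
  assert(lsls(0x80000001u, 1).Carry);
  assert(lsls(1u, 4).Value == 16u && !lsls(1u, 4).Carry);
  return 0;
}
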
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 7cbfaba7a8eb..25a45b39fa0c 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -45,7 +45,8 @@ def mve_shift_imm : AsmOperandClass {
let RenderMethod = "addImmOperands";
let DiagnosticString = "operand must be an immediate in the range [1,32]";
}
-def long_shift : Operand<i32> {
+def long_shift : Operand<i32>,
+ ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
let ParserMatchClass = mve_shift_imm;
let DecoderMethod = "DecodeLongShiftOperand";
}
@@ -2394,6 +2395,23 @@ def : Thumb2DSPPat<(int_arm_qadd(int_arm_qadd rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)),
(t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(saddsat rGPR:$Rm, rGPR:$Rn),
+ (t2QADD rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ssubsat rGPR:$Rm, rGPR:$Rn),
+ (t2QSUB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+ (t2QDADD rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
+ (t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqadd8b rGPR:$Rm, rGPR:$Rn),
+ (t2QADD8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqsub8b rGPR:$Rm, rGPR:$Rn),
+ (t2QSUB8 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqadd16b rGPR:$Rm, rGPR:$Rn),
+ (t2QADD16 rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(ARMqsub16b rGPR:$Rm, rGPR:$Rn),
+ (t2QSUB16 rGPR:$Rm, rGPR:$Rn)>;
+
// Signed/Unsigned add/subtract
def t2SASX : T2I_pam_intrinsics<0b010, 0b0000, "sasx", int_arm_sasx>;
@@ -4085,7 +4103,7 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
// Pseudo instruction that combines movs + predicated rsbmi
// to implement integer ABS
-let usesCustomInserter = 1, Defs = [CPSR] in {
+let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
NoItinerary, []>, Requires<[IsThumb2]>;
}
@@ -4175,15 +4193,15 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm, list<dag>
}
let DecoderNamespace = "Thumb2CoProc" in {
-defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
-defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
-defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
-defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl timm:$cop, timm:$CRd, addrmode5:$addr)]>;
+defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l timm:$cop, timm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
}
@@ -4368,8 +4386,8 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
(outs),
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
- [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>,
+ [(int_arm_mcr timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+ timm:$CRm, timm:$opc2)]>,
ComplexDeprecationPredicate<"MCR">;
def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
(t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
@@ -4377,8 +4395,8 @@ def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
(outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
- [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]> {
+ [(int_arm_mcr2 timm:$cop, timm:$opc1, GPR:$Rt, timm:$CRn,
+ timm:$CRm, timm:$opc2)]> {
let Predicates = [IsThumb2, PreV8];
}
def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
@@ -4402,24 +4420,24 @@ def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
(t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
-def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
- (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : T2v6Pat<(int_arm_mrc timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2),
+ (t2MRC p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
-def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
- (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+def : T2v6Pat<(int_arm_mrc2 timm:$cop, timm:$opc1, timm:$CRn, timm:$CRm, timm:$opc2),
+ (t2MRC2 p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2)>;
/* from ARM core register to coprocessor */
def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0, (outs),
(ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
c_imm:$CRm),
- [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
- imm:$CRm)]>;
+ [(int_arm_mcrr timm:$cop, timm:$opc1, GPR:$Rt, GPR:$Rt2,
+ timm:$CRm)]>;
def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, (outs),
(ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
c_imm:$CRm),
- [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
- GPR:$Rt2, imm:$CRm)]> {
+ [(int_arm_mcrr2 timm:$cop, timm:$opc1, GPR:$Rt,
+ GPR:$Rt2, timm:$CRm)]> {
let Predicates = [IsThumb2, PreV8];
}
@@ -4439,8 +4457,8 @@ def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1, (outs GPR:$Rt, GPR:$Rt2),
def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
"cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
- [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]> {
+ [(int_arm_cdp timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+ timm:$CRm, timm:$opc2)]> {
let Inst{27-24} = 0b1110;
bits<4> opc1;
@@ -4465,8 +4483,8 @@ def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
"cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
- [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]> {
+ [(int_arm_cdp2 timm:$cop, timm:$opc1, timm:$CRd, timm:$CRn,
+ timm:$CRm, timm:$opc2)]> {
let Inst{27-24} = 0b1110;
bits<4> opc1;
@@ -5087,6 +5105,7 @@ def t2BF_LabelPseudo
: t2PseudoInst<(outs ), (ins pclabel:$cp), 0, NoItinerary, []> {
let isTerminator = 1;
let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB];
+ let hasNoSchedulingInfo = 1;
}
def t2BFi : t2BF<(ins bflabel_u4:$b_label, bflabel_s16:$label, pred:$p),
@@ -5217,11 +5236,13 @@ def t2LoopDec :
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
4, IIC_Br, []>, Sched<[WriteBr]>;
-let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in {
+let isBranch = 1, isTerminator = 1, hasSideEffects = 1, Defs = [CPSR] in {
+// Set WhileLoopStart and LoopEnd to occupy 8 bytes because they may
+// get converted into t2CMP and t2Bcc.
def t2WhileLoopStart :
t2PseudoInst<(outs),
(ins rGPR:$elts, brtarget:$target),
- 4, IIC_Br, []>,
+ 8, IIC_Br, []>,
Sched<[WriteBr]>;
def t2LoopEnd :
@@ -5233,7 +5254,7 @@ def t2LoopEnd :
} // end isNotDuplicable
class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
- : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZR:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond),
+ : V8_1MI<(outs rGPR:$Rd), (ins GPRwithZRnosp:$Rn, GPRwithZRnosp:$Rm, pred_noal:$fcond),
AddrModeNone, NoItinerary, iname, "$Rd, $Rn, $Rm, $fcond", "", pattern> {
bits<4> Rd;
bits<4> Rm;
@@ -5255,6 +5276,25 @@ def t2CSINC : CS<"csinc", 0b1001>;
def t2CSINV : CS<"csinv", 0b1010>;
def t2CSNEG : CS<"csneg", 0b1011>;
+let Predicates = [HasV8_1MMainline] in {
+ def : T2Pat<(ARMcsinc GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+ (t2CSINC GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+ def : T2Pat<(ARMcsinv GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+ (t2CSINV GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+ def : T2Pat<(ARMcsneg GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm),
+ (t2CSNEG GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+
+ multiclass ModifiedV8_1CSEL<Instruction Insn, dag modvalue> {
+ def : T2Pat<(ARMcmov modvalue, GPRwithZR:$tval, cmovpred:$imm),
+ (Insn GPRwithZR:$tval, GPRwithZR:$fval, imm0_31:$imm)>;
+ def : T2Pat<(ARMcmov GPRwithZR:$tval, modvalue, cmovpred:$imm),
+ (Insn GPRwithZR:$tval, GPRwithZR:$fval,
+ (i32 (inv_cond_XFORM imm:$imm)))>;
+ }
+ defm : ModifiedV8_1CSEL<t2CSINC, (add rGPR:$fval, 1)>;
+ defm : ModifiedV8_1CSEL<t2CSINV, (xor rGPR:$fval, -1)>;
+ defm : ModifiedV8_1CSEL<t2CSNEG, (sub 0, rGPR:$fval)>;
+}
// CS aliases.
let Predicates = [HasV8_1MMainline] in {
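
The ModifiedV8_1CSEL patterns above lean on the standard conditional-select identities: when the condition fails, CSINC yields the false operand plus one, CSINV its bitwise complement, and CSNEG its negation, which is why ARMcmov of (add fval, 1), (xor fval, -1) and (sub 0, fval) can be matched onto them; the second pattern in the multiclass inverts the condition to handle the swapped operand order. A minimal C++ sketch of those identities, not the TableGen matching itself:

#include <cassert>
#include <cstdint>

// Conditional-select identities assumed by the CSINC/CSINV/CSNEG patterns:
// the "true" operand is returned as-is, the "false" operand is post-processed.
static int32_t csinc(int32_t T, int32_t F, bool Cond) { return Cond ? T : F + 1; }
static int32_t csinv(int32_t T, int32_t F, bool Cond) { return Cond ? T : ~F; }
static int32_t csneg(int32_t T, int32_t F, bool Cond) { return Cond ? T : -F; }

int main() {
  // select(cond, tval, fval + 1) == csinc(tval, fval, cond), and so on.
  assert(csinc(5, 7, false) == 8 && csinc(5, 7, true) == 5);
  assert(csinv(5, 0, false) == -1);
  assert(csneg(5, 7, false) == -7);
  return 0;
}
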
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index a0dd25de07ee..fdd961bfbb2f 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-def SDT_CMPFP0 : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisVT<1, i32>]>;
+def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
SDTCisSameAs<1, 2>]>;
def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
@@ -19,7 +19,7 @@ def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
def SDT_VMOVSR : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>;
def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>;
-def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMFCmp, [SDNPOutGlue]>;
+def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>;
def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
@@ -324,7 +324,7 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
// However, there is no UAL syntax for them, so we keep them around for
// (dis)assembly only.
multiclass vfp_ldstx_mult<string asm, bit L_bit> {
- let Predicates = [HasFPRegs] in {
+ let Predicates = [HasFPRegs], hasNoSchedulingInfo = 1 in {
// Unknown precision
def XIA :
AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
@@ -548,12 +548,12 @@ let Defs = [FPSCR_NZCV] in {
def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
- [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 1))]>;
+ [/* For disassembly only; pattern left blank */]>;
def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
- [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 1))]> {
+ [/* For disassembly only; pattern left blank */]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -562,17 +562,17 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
- [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>;
+ [/* For disassembly only; pattern left blank */]>;
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
- [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 0))]>;
+ [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
- [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 0))]> {
+ [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -581,7 +581,7 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
- [(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>;
+ [(arm_cmpfp HPR:$Sd, HPR:$Sm)]>;
} // Defs = [FPSCR_NZCV]
//===----------------------------------------------------------------------===//
@@ -611,7 +611,7 @@ let Defs = [FPSCR_NZCV] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
- [(arm_cmpfp0 (f64 DPR:$Dd), (i32 1))]> {
+ [/* For disassembly only; pattern left blank */]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -619,7 +619,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins SPR:$Sd),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
- [(arm_cmpfp0 SPR:$Sd, (i32 1))]> {
+ [/* For disassembly only; pattern left blank */]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
@@ -631,7 +631,7 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
- [(arm_cmpfp0 HPR:$Sd, (i32 1))]> {
+ [/* For disassembly only; pattern left blank */]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -639,7 +639,7 @@ def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
- [(arm_cmpfp0 (f64 DPR:$Dd), (i32 0))]> {
+ [(arm_cmpfp0 (f64 DPR:$Dd))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -647,7 +647,7 @@ def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins SPR:$Sd),
IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
- [(arm_cmpfp0 SPR:$Sd, (i32 0))]> {
+ [(arm_cmpfp0 SPR:$Sd)]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
@@ -659,7 +659,7 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
- [(arm_cmpfp0 HPR:$Sd, (i32 0))]> {
+ [(arm_cmpfp0 HPR:$Sd)]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -1732,7 +1732,8 @@ def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
- IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1740,7 +1741,8 @@ def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0,
def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
- IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1748,7 +1750,8 @@ def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1,
def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
- IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -2297,6 +2300,8 @@ class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
let Inst{6-5} = 0b00;
let Inst{4} = 1;
let Inst{3-0} = 0b0000;
+ let Unpredictable{7-5} = 0b111;
+ let Unpredictable{3-0} = 0b1111;
}
let DecoderMethod = "DecodeForVMRSandVMSR" in {
@@ -2370,63 +2375,65 @@ class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
// Instruction operand.
- bits<4> src;
-
- // Encode instruction operand.
- let Inst{15-12} = src;
+ bits<4> Rt;
let Inst{27-20} = 0b11101110;
let Inst{19-16} = opc19_16;
+ let Inst{15-12} = Rt;
let Inst{11-8} = 0b1010;
let Inst{7} = 0;
+ let Inst{6-5} = 0b00;
let Inst{4} = 1;
+ let Inst{3-0} = 0b0000;
let Predicates = [HasVFP2];
+ let Unpredictable{7-5} = 0b111;
+ let Unpredictable{3-0} = 0b1111;
}
let DecoderMethod = "DecodeForVMRSandVMSR" in {
let Defs = [FPSCR] in {
let Predicates = [HasFPRegs] in
// Application level GPR -> FPSCR
- def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpscr, $src",
- [(int_arm_set_fpscr GPRnopc:$src)]>;
+ def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpscr, $Rt",
+ [(int_arm_set_fpscr GPRnopc:$Rt)]>;
// System level GPR -> FPEXC
- def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpexc, $src", []>;
+ def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpexc, $Rt", []>;
// System level GPR -> FPSID
- def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpsid, $src", []>;
- def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpinst, $src", []>;
- def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$src),
- "vmsr", "\tfpinst2, $src", []>;
+ def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpsid, $Rt", []>;
+ def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpinst, $Rt", []>;
+ def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPRnopc:$Rt),
+ "vmsr", "\tfpinst2, $Rt", []>;
}
let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
// System level GPR -> FPSCR with context saving for security extensions
- def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$src),
- "vmsr", "\tfpcxtns, $src", []>;
+ def VMSR_FPCXTNS : MovToVFP<0b1110 /* fpcxtns */, (outs), (ins GPR:$Rt),
+ "vmsr", "\tfpcxtns, $Rt", []>;
}
let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
// System level GPR -> FPSCR with context saving for security extensions
- def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$src),
- "vmsr", "\tfpcxts, $src", []>;
+ def VMSR_FPCXTS : MovToVFP<0b1111 /* fpcxts */, (outs), (ins GPR:$Rt),
+ "vmsr", "\tfpcxts, $Rt", []>;
}
let Predicates = [HasV8_1MMainline, HasFPRegs] in {
// System level GPR -> FPSCR_NZCVQC
def VMSR_FPSCR_NZCVQC
: MovToVFP<0b0010 /* fpscr_nzcvqc */,
- (outs cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$src),
- "vmsr", "\tfpscr_nzcvqc, $src", []>;
+ (outs cl_FPSCR_NZCV:$fpscr_out), (ins GPR:$Rt),
+ "vmsr", "\tfpscr_nzcvqc, $Rt", []>;
}
let Predicates = [HasV8_1MMainline, HasMVEInt] in {
// System level GPR -> VPR/P0
let Defs = [VPR] in
- def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$src),
- "vmsr", "\tvpr, $src", []>;
+ def VMSR_VPR : MovToVFP<0b1100 /* vpr */, (outs), (ins GPR:$Rt),
+ "vmsr", "\tvpr, $Rt", []>;
- def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$src),
- "vmsr", "\tp0, $src", []>;
+ def VMSR_P0 : MovToVFP<0b1101 /* p0 */, (outs VCCR:$cond), (ins GPR:$Rt),
+ "vmsr", "\tp0, $Rt", []>;
}
}
@@ -2614,7 +2621,8 @@ def VSCCLRMD : VFPXI<(outs), (ins pred:$p, fp_dreglist_with_vpr:$regs, variable_
let Inst{21-16} = 0b011111;
let Inst{15-12} = regs{11-8};
let Inst{11-8} = 0b1011;
- let Inst{7-0} = regs{7-0};
+ let Inst{7-1} = regs{7-1};
+ let Inst{0} = 0;
let DecoderMethod = "DecodeVSCCLRM";
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 4485a474a6df..8e5e474c0f59 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -34,7 +34,7 @@ public:
ARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI,
const ARMRegisterBankInfo &RBI);
- bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ bool select(MachineInstr &I) override;
static const char *getName() { return DEBUG_TYPE; }
private:
@@ -210,8 +210,8 @@ static const TargetRegisterClass *guessRegClass(unsigned Reg,
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
- unsigned DstReg = I.getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ Register DstReg = I.getOperand(0).getReg();
+ if (Register::isPhysicalRegister(DstReg))
return true;
const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI);
@@ -236,17 +236,17 @@ static bool selectMergeValues(MachineInstrBuilder &MIB,
// We only support G_MERGE_VALUES as a way to stick together two scalar GPRs
// into one DPR.
- unsigned VReg0 = MIB->getOperand(0).getReg();
+ Register VReg0 = MIB->getOperand(0).getReg();
(void)VReg0;
assert(MRI.getType(VReg0).getSizeInBits() == 64 &&
RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID &&
"Unsupported operand for G_MERGE_VALUES");
- unsigned VReg1 = MIB->getOperand(1).getReg();
+ Register VReg1 = MIB->getOperand(1).getReg();
(void)VReg1;
assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_MERGE_VALUES");
- unsigned VReg2 = MIB->getOperand(2).getReg();
+ Register VReg2 = MIB->getOperand(2).getReg();
(void)VReg2;
assert(MRI.getType(VReg2).getSizeInBits() == 32 &&
RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID &&
@@ -268,17 +268,17 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB,
// We only support G_UNMERGE_VALUES as a way to break up one DPR into two
// GPRs.
- unsigned VReg0 = MIB->getOperand(0).getReg();
+ Register VReg0 = MIB->getOperand(0).getReg();
(void)VReg0;
assert(MRI.getType(VReg0).getSizeInBits() == 32 &&
RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_UNMERGE_VALUES");
- unsigned VReg1 = MIB->getOperand(1).getReg();
+ Register VReg1 = MIB->getOperand(1).getReg();
(void)VReg1;
assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_UNMERGE_VALUES");
- unsigned VReg2 = MIB->getOperand(2).getReg();
+ Register VReg2 = MIB->getOperand(2).getReg();
(void)VReg2;
assert(MRI.getType(VReg2).getSizeInBits() == 64 &&
RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::FPRRegBankID &&
@@ -833,8 +833,7 @@ void ARMInstructionSelector::renderVFPF64Imm(
NewInstBuilder.addImm(FPImmEncoding);
}
-bool ARMInstructionSelector::select(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+bool ARMInstructionSelector::select(MachineInstr &I) {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -851,7 +850,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
using namespace TargetOpcode;
- if (selectImpl(I, CoverageInfo))
+ if (selectImpl(I, *CoverageInfo))
return true;
MachineInstrBuilder MIB{MF, I};
@@ -874,10 +873,10 @@ bool ARMInstructionSelector::select(MachineInstr &I,
MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp());
if (isSExt) {
- unsigned SExtResult = I.getOperand(0).getReg();
+ Register SExtResult = I.getOperand(0).getReg();
// Use a new virtual register for the result of the AND
- unsigned AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ Register AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass);
I.getOperand(0).setReg(AndResult);
auto InsertBefore = std::next(I.getIterator());
@@ -928,7 +927,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
assert(MRI.getType(SrcReg).getSizeInBits() == 64 && "Unsupported size");
assert(MRI.getType(DstReg).getSizeInBits() <= 32 && "Unsupported size");
- unsigned IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ Register IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass);
auto InsertBefore = std::next(I.getIterator());
auto MovI =
BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::VMOVRRD))
@@ -1039,7 +1038,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
case G_FCMP: {
assert(STI.hasVFP2Base() && "Can't select fcmp without VFP");
- unsigned OpReg = I.getOperand(2).getReg();
+ Register OpReg = I.getOperand(2).getReg();
unsigned Size = MRI.getType(OpReg).getSizeInBits();
if (Size == 64 && !STI.hasFP64()) {
@@ -1077,12 +1076,12 @@ bool ARMInstructionSelector::select(MachineInstr &I,
case G_STORE:
case G_LOAD: {
const auto &MemOp = **I.memoperands_begin();
- if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ if (MemOp.isAtomic()) {
LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
- unsigned Reg = I.getOperand(0).getReg();
+ Register Reg = I.getOperand(0).getReg();
unsigned RegBank = RBI.getRegBank(Reg, MRI, TRI)->getID();
LLT ValTy = MRI.getType(Reg);
@@ -1097,9 +1096,9 @@ bool ARMInstructionSelector::select(MachineInstr &I,
if (ValSize == 1 && NewOpc == Opcodes.STORE8) {
// Before storing a 1-bit value, make sure to clear out any unneeded bits.
- unsigned OriginalValue = I.getOperand(0).getReg();
+ Register OriginalValue = I.getOperand(0).getReg();
- unsigned ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ Register ValueToStore = MRI.createVirtualRegister(&ARM::GPRRegClass);
I.getOperand(0).setReg(ValueToStore);
auto InsertBefore = I.getIterator();
@@ -1159,7 +1158,7 @@ bool ARMInstructionSelector::select(MachineInstr &I,
case G_PHI: {
I.setDesc(TII.get(PHI));
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
const TargetRegisterClass *RC = guessRegClass(DstReg, MRI, TRI, RBI);
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
break;
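
Most of the churn in ARMInstructionSelector.cpp is the mechanical switch from raw `unsigned` register numbers to the `Register` wrapper, which still converts implicitly to `unsigned`, so only the declared types change. A rough stand-in for the idea (simplified, not LLVM's actual class):

#include <cassert>
#include <cstdint>

// Simplified stand-in for llvm::Register: wraps the raw register number and
// still converts to unsigned, so existing call sites keep compiling unchanged.
class Register {
  unsigned Reg;
public:
  constexpr Register(unsigned R = 0) : Reg(R) {}
  constexpr operator unsigned() const { return Reg; }
  // Virtual registers are tagged with the top bit set; physical ones are not.
  static bool isPhysicalRegister(unsigned R) { return R != 0 && !(R & (1u << 31)); }
  static bool isVirtualRegister(unsigned R) { return (R & (1u << 31)) != 0; }
};

int main() {
  Register Phys(13);              // e.g. a physical register number
  Register Virt((1u << 31) | 42); // e.g. a virtual register number
  assert(Register::isPhysicalRegister(Phys));
  assert(Register::isVirtualRegister(Virt));
  unsigned Raw = Phys;            // implicit conversion still works
  assert(Raw == 13);
  return 0;
}
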
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 73a57b297ad6..81414e6d76fe 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -84,6 +84,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
.legalForCartesianProduct({s8, s16, s32}, {s1, s8, s16});
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+
getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR})
.legalFor({s32})
.minScalar(0, s32);
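
Marking G_SEXT_INREG as `.lower()` asks the generic legalizer to expand it rather than leaving it for instruction selection; the usual expansion is a shift-left/arithmetic-shift-right pair. A sketch of the arithmetic that the lowered form computes (not the legalizer code itself):

#include <cassert>
#include <cstdint>

// Sign-extend the low 'Bits' bits of V inside a 32-bit value, i.e. what a
// lowered G_SEXT_INREG computes: shift left by (32 - Bits), then shift right
// arithmetically by the same amount.
static int32_t sextInReg(uint32_t V, unsigned Bits) {
  unsigned Shift = 32u - Bits;              // assumes 1 <= Bits <= 32
  return static_cast<int32_t>(V << Shift) >> Shift;
}

int main() {
  assert(sextInReg(0xFFu, 8) == -1);        // 0xFF as an i8 is -1
  assert(sextInReg(0x7Fu, 8) == 127);       // sign bit clear: value unchanged
  assert(sextInReg(0x8000u, 16) == -32768); // i16 sign bit set
  return 0;
}
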
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 90a1ce238c3f..4a193fed04a3 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -509,7 +509,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
Offset = MO.getImm() - WordOffset * getImmScale(Opc);
// If storing the base register, it needs to be reset first.
- unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();
+ Register InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();
if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
MO.setImm(Offset);
@@ -859,7 +859,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
// Determine list of registers and list of implicit super-register defs.
for (const MachineInstr *MI : Cand.Instrs) {
const MachineOperand &MO = getLoadStoreRegOp(*MI);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
bool IsKill = MO.isKill();
if (IsKill)
KilledRegs.insert(Reg);
@@ -874,7 +874,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
if (!MO.isReg() || !MO.isDef() || MO.isDead())
continue;
assert(MO.isImplicit());
- unsigned DefReg = MO.getReg();
+ Register DefReg = MO.getReg();
if (is_contained(ImpDefs, DefReg))
continue;
@@ -893,7 +893,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
iterator InsertBefore = std::next(iterator(LatestMI));
MachineBasicBlock &MBB = *LatestMI->getParent();
unsigned Offset = getMemoryOpOffset(*First);
- unsigned Base = getLoadStoreBaseOp(*First).getReg();
+ Register Base = getLoadStoreBaseOp(*First).getReg();
bool BaseKill = LatestMI->killsRegister(Base);
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
@@ -1005,7 +1005,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
const MachineInstr *MI = MemOps[SIndex].MI;
int Offset = MemOps[SIndex].Offset;
const MachineOperand &PMO = getLoadStoreRegOp(*MI);
- unsigned PReg = PMO.getReg();
+ Register PReg = PMO.getReg();
unsigned PRegNum = PMO.isUndef() ? std::numeric_limits<unsigned>::max()
: TRI->getEncodingValue(PReg);
unsigned Latest = SIndex;
@@ -1052,7 +1052,7 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
if (NewOffset != Offset + (int)Size)
break;
const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == ARM::SP || Reg == ARM::PC)
break;
if (Count == Limit)
@@ -1261,7 +1261,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
if (isThumb1) return false;
const MachineOperand &BaseOP = MI->getOperand(0);
- unsigned Base = BaseOP.getReg();
+ Register Base = BaseOP.getReg();
bool BaseKill = BaseOP.isKill();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
@@ -1387,7 +1387,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
// FIXME: Use LDM/STM with single register instead.
if (isThumb1) return false;
- unsigned Base = getLoadStoreBaseOp(*MI).getReg();
+ Register Base = getLoadStoreBaseOp(*MI).getReg();
bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
unsigned Opcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
@@ -1512,7 +1512,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
// Behaviour for writeback is undefined if base register is the same as one
// of the others.
const MachineOperand &BaseOp = MI.getOperand(2);
- unsigned Base = BaseOp.getReg();
+ Register Base = BaseOp.getReg();
const MachineOperand &Reg0Op = MI.getOperand(0);
const MachineOperand &Reg1Op = MI.getOperand(1);
if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
@@ -1655,9 +1655,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
return false;
const MachineOperand &BaseOp = MI->getOperand(2);
- unsigned BaseReg = BaseOp.getReg();
- unsigned EvenReg = MI->getOperand(0).getReg();
- unsigned OddReg = MI->getOperand(1).getReg();
+ Register BaseReg = BaseOp.getReg();
+ Register EvenReg = MI->getOperand(0).getReg();
+ Register OddReg = MI->getOperand(1).getReg();
unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
@@ -1783,8 +1783,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
if (isMemoryOp(*MBBI)) {
unsigned Opcode = MBBI->getOpcode();
const MachineOperand &MO = MBBI->getOperand(0);
- unsigned Reg = MO.getReg();
- unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
+ Register Reg = MO.getReg();
+ Register Base = getLoadStoreBaseOp(*MBBI).getReg();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
int Offset = getMemoryOpOffset(*MBBI);
@@ -2121,7 +2121,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
MachineOperand &MO = I->getOperand(j);
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (MO.isDef() && TRI->regsOverlap(Reg, Base))
return false;
if (Reg != Base && !MemRegs.count(Reg))
@@ -2415,7 +2415,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
int Opc = MI.getOpcode();
bool isLd = isLoadSingle(Opc);
- unsigned Base = MI.getOperand(1).getReg();
+ Register Base = MI.getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);
bool StopHere = false;
auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) {
diff --git a/lib/Target/ARM/ARMLowOverheadLoops.cpp b/lib/Target/ARM/ARMLowOverheadLoops.cpp
index cedf3bd3c74e..e1c5a9c3e223 100644
--- a/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -11,8 +11,7 @@
/// The expectation is that the loop contains three pseudo instructions:
/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
/// form should be in the preheader, whereas the while form should be in the
-/// preheaders only predecessor. TODO: Could DoLoopStart get moved into the
-/// pre-preheader?
+/// preheader's only predecessor.
/// - t2LoopDec - placed within the loop body.
/// - t2LoopEnd - the loop latch terminator.
///
@@ -35,6 +34,7 @@ using namespace llvm;
namespace {
class ARMLowOverheadLoops : public MachineFunctionPass {
+ MachineFunction *MF = nullptr;
const ARMBaseInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
@@ -52,17 +52,6 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
- bool ProcessLoop(MachineLoop *ML);
-
- void RevertWhile(MachineInstr *MI) const;
-
- void RevertLoopDec(MachineInstr *MI) const;
-
- void RevertLoopEnd(MachineInstr *MI) const;
-
- void Expand(MachineLoop *ML, MachineInstr *Start,
- MachineInstr *Dec, MachineInstr *End, bool Revert);
-
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
@@ -71,36 +60,156 @@ namespace {
StringRef getPassName() const override {
return ARM_LOW_OVERHEAD_LOOPS_NAME;
}
+
+ private:
+ bool ProcessLoop(MachineLoop *ML);
+
+ MachineInstr * IsSafeToDefineLR(MachineInstr *MI);
+
+ bool RevertNonLoops();
+
+ void RevertWhile(MachineInstr *MI) const;
+
+ bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const;
+
+ void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
+
+ void Expand(MachineLoop *ML, MachineInstr *Start,
+ MachineInstr *InsertPt, MachineInstr *Dec,
+ MachineInstr *End, bool Revert);
+
};
}
-
+
char ARMLowOverheadLoops::ID = 0;
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
false, false)
-bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) {
- if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB())
+bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
+ const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
+ if (!ST.hasLOB())
return false;
- LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n");
+ MF = &mf;
+ LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
auto &MLI = getAnalysis<MachineLoopInfo>();
- MRI = &MF.getRegInfo();
- TII = static_cast<const ARMBaseInstrInfo*>(
- MF.getSubtarget().getInstrInfo());
- BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF));
+ MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+ MRI = &MF->getRegInfo();
+ TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
+ BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
BBUtils->computeAllBlockSizes();
- BBUtils->adjustBBOffsetsAfter(&MF.front());
+ BBUtils->adjustBBOffsetsAfter(&MF->front());
bool Changed = false;
for (auto ML : MLI) {
if (!ML->getParentLoop())
Changed |= ProcessLoop(ML);
}
+ Changed |= RevertNonLoops();
return Changed;
}
+static bool IsLoopStart(MachineInstr &MI) {
+ return MI.getOpcode() == ARM::t2DoLoopStart ||
+ MI.getOpcode() == ARM::t2WhileLoopStart;
+}
+
+template<typename T>
+static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) {
+ for (auto &MI : make_range(T(Begin), End)) {
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
+ continue;
+ return &MI;
+ }
+ }
+ return nullptr;
+}
+
+static MachineInstr* SearchForUse(MachineInstr *Begin,
+ MachineBasicBlock::iterator End,
+ unsigned Reg) {
+ for (auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) {
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
+ continue;
+ return &MI;
+ }
+ }
+ return nullptr;
+}
+
+// Is it safe to define LR with DLS/WLS?
+// LR can be defined if it is the operand to Start, because it's the same value,
+// or if it's going to be equivalent to the operand to Start.
+MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) {
+
+ auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) {
+ return MI->getOpcode() == ARM::tMOVr &&
+ MI->getOperand(0).getReg() == ARM::LR &&
+ MI->getOperand(1).getReg() == Reg &&
+ MI->getOperand(2).getImm() == ARMCC::AL;
+ };
+
+ MachineBasicBlock *MBB = Start->getParent();
+ unsigned CountReg = Start->getOperand(0).getReg();
+ // Walk forward and backward in the block to find the closest instructions
+ // that define LR. Then also filter them out if they're not a mov lr.
+ MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR);
+ if (PredLRDef && !IsMoveLR(PredLRDef, CountReg))
+ PredLRDef = nullptr;
+
+ MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR);
+ if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg))
+ SuccLRDef = nullptr;
+
+ // We've found either one, two or no mov lr instructions... Now figure out
+ // if they are performing the equivalent mov that the Start instruction will.
+ // Do this by scanning forward and backward to see if there's a def of the
+ // register holding the count value. If we find a suitable def, return it as
+ // the insert point. Later, if InsertPt != Start, then we can remove the
+ // redundant instruction.
+ if (SuccLRDef) {
+ MachineBasicBlock::iterator End(SuccLRDef);
+ if (!SearchForDef(Start, End, CountReg)) {
+ return SuccLRDef;
+ } else
+ SuccLRDef = nullptr;
+ }
+ if (PredLRDef) {
+ MachineBasicBlock::reverse_iterator End(PredLRDef);
+ if (!SearchForDef(Start, End, CountReg)) {
+ return PredLRDef;
+ } else
+ PredLRDef = nullptr;
+ }
+
+ // We can define LR because LR already contains the same value.
+ if (Start->getOperand(0).getReg() == ARM::LR)
+ return Start;
+
+ // We've found no suitable LR def and Start doesn't use LR directly. Can we
+ // just define LR anyway?
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(*MBB);
+
+ // Not if we haven't found a suitable mov and LR is live out.
+ if (LiveRegs.contains(ARM::LR))
+ return nullptr;
+
+ // If LR is not live out, we can insert the instruction if nothing else
+ // uses LR after it.
+ if (!SearchForUse(Start, MBB->end(), ARM::LR))
+ return Start;
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for"
+ << " LR\n");
+ return nullptr;
+}
+
bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
bool Changed = false;
@@ -111,15 +220,10 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
- auto IsLoopStart = [](MachineInstr &MI) {
- return MI.getOpcode() == ARM::t2DoLoopStart ||
- MI.getOpcode() == ARM::t2WhileLoopStart;
- };
-
// Search the given block for a loop start instruction. If one isn't found,
// and there's only one predecessor block, search that one too.
std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
- [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
+ [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
for (auto &MI : *MBB) {
if (IsLoopStart(MI))
return &MI;
@@ -165,41 +269,62 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
Dec = &MI;
else if (MI.getOpcode() == ARM::t2LoopEnd)
End = &MI;
- else if (MI.getDesc().isCall())
+ else if (IsLoopStart(MI))
+ Start = &MI;
+ else if (MI.getDesc().isCall()) {
// TODO: Though the call will require LE to execute again, does this
// mean we should revert? Always executing LE hopefully should be
// faster than performing a sub,cmp,br or even subs,br.
Revert = true;
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
+ }
- if (!Dec)
+ if (!Dec || End)
continue;
- // If we find that we load/store LR between LoopDec and LoopEnd, expect
- // that the decremented value has been spilled to the stack. Because
- // this value isn't actually going to be produced until the latch, by LE,
- // we would need to generate a real sub. The value is also likely to be
- // reloaded for use of LoopEnd - in which in case we'd need to perform
- // an add because it gets negated again by LE! The other option is to
- // then generate the other form of LE which doesn't perform the sub.
- if (MI.mayLoad() || MI.mayStore())
- Revert =
- MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR;
+ // If we find that LR has been written or read between LoopDec and
+ // LoopEnd, expect that the decremented value is being used elsewhere.
+ // Because this value isn't actually going to be produced until the
+ // latch, by LE, we would need to generate a real sub. The value is also
+ // likely to be copied/reloaded for use by LoopEnd - in which case
+ // we'd need to perform an add because it gets subtracted again by LE!
+ // The other option is to then generate the other form of LE which doesn't
+ // perform the sub.
+ for (auto &MO : MI.operands()) {
+ if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
+ MO.getReg() == ARM::LR) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
+ Revert = true;
+ break;
+ }
+ }
}
if (Dec && End && Revert)
break;
}
+ LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
+ if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
+ if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;);
+
if (!Start && !Dec && !End) {
LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
return Changed;
- } if (!(Start && Dec && End)) {
- report_fatal_error("Failed to find all loop components");
+ } else if (!(Start && Dec && End)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n");
+ return false;
}
- if (!End->getOperand(1).isMBB() ||
- End->getOperand(1).getMBB() != ML->getHeader())
- report_fatal_error("Expected LoopEnd to target Loop Header");
+ if (!End->getOperand(1).isMBB())
+ report_fatal_error("Expected LoopEnd to target basic block");
+
+ // TODO: Maybe there are cases where the target doesn't have to be the header,
+ // but for now be safe and revert.
+ if (End->getOperand(1).getMBB() != ML->getHeader()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting the header.\n");
+ Revert = true;
+ }
// The WLS and LE instructions have 12-bits for the label offset. WLS
// requires a positive offset, while LE uses negative.
@@ -216,41 +341,57 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
Revert = true;
}
- LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start
- << " - Found Loop Dec: " << *Dec
- << " - Found Loop End: " << *End);
+ MachineInstr *InsertPt = Revert ? nullptr : IsSafeToDefineLR(Start);
+ if (!InsertPt) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
+ Revert = true;
+ } else
+ LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
- Expand(ML, Start, Dec, End, Revert);
+ Expand(ML, Start, InsertPt, Dec, End, Revert);
return true;
}
// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
// beq that branches to the exit branch.
-// FIXME: Need to check that we're not trashing the CPSR when generating the
-// cmp. We could also try to generate a cbz if the value in LR is also in
+// TODO: We could also try to generate a cbz if the value in LR is also in
// another low register.
void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
MachineBasicBlock *MBB = MI->getParent();
MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(ARM::t2CMPri));
- MIB.addReg(ARM::LR);
+ MIB.add(MI->getOperand(0));
MIB.addImm(0);
MIB.addImm(ARMCC::AL);
- MIB.addReg(ARM::CPSR);
+ MIB.addReg(ARM::NoRegister);
+
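+ // Use the narrow tBcc encoding when the branch target is within range,
+ // otherwise fall back to the wide t2Bcc.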
+ MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
+ unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
+ ARM::tBcc : ARM::t2Bcc;
- // TODO: Try to use tBcc instead
- MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
MIB.add(MI->getOperand(1)); // branch target
MIB.addImm(ARMCC::EQ); // condition code
MIB.addReg(ARM::CPSR);
MI->eraseFromParent();
}
-// TODO: Check flags so that we can possibly generate a tSubs or tSub.
-void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
+bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
+ bool AllowFlags) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
MachineBasicBlock *MBB = MI->getParent();
+
+ // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
+ bool SetFlags = false;
+ if (AllowFlags) {
+ if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) {
+ if (!SearchForUse(MI, MBB->end(), ARM::CPSR) &&
+ Def->getOpcode() == ARM::t2LoopEnd)
+ SetFlags = true;
+ }
+ }
+
MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(ARM::t2SUBri));
MIB.addDef(ARM::LR);
@@ -258,28 +399,39 @@ void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
MIB.add(MI->getOperand(2));
MIB.addImm(ARMCC::AL);
MIB.addReg(0);
- MIB.addReg(0);
+
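+ // The trailing operand of t2SUBri is the optional flag-setting register;
+ // adding CPSR there as a def gives us the flag-setting SUBS form.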
+ if (SetFlags) {
+ MIB.addReg(ARM::CPSR);
+ MIB->getOperand(5).setIsDef(true);
+ } else
+ MIB.addReg(0);
+
MI->eraseFromParent();
+ return SetFlags;
}
// Generate a subs, or sub and cmp, and a branch instead of an LE.
-// FIXME: Need to check that we're not trashing the CPSR when generating
-// the cmp.
-void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const {
+void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
- // Create cmp
MachineBasicBlock *MBB = MI->getParent();
- MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(ARM::t2CMPri));
- MIB.addReg(ARM::LR);
- MIB.addImm(0);
- MIB.addImm(ARMCC::AL);
- MIB.addReg(ARM::CPSR);
+ // Create cmp
+ if (!SkipCmp) {
+ MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(ARM::t2CMPri));
+ MIB.addReg(ARM::LR);
+ MIB.addImm(0);
+ MIB.addImm(ARMCC::AL);
+ MIB.addReg(ARM::NoRegister);
+ }
+
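+ // As with RevertWhile, prefer the narrow tBcc when the target is in range.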
+ MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
+ unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
+ ARM::tBcc : ARM::t2Bcc;
- // TODO Try to use tBcc instead.
// Create bne
- MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
MIB.add(MI->getOperand(1)); // branch target
MIB.addImm(ARMCC::NE); // condition code
MIB.addReg(ARM::CPSR);
@@ -287,33 +439,13 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const {
}
void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
+ MachineInstr *InsertPt,
MachineInstr *Dec, MachineInstr *End,
bool Revert) {
- auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) {
- // The trip count should already been held in LR since the instructions
- // within the loop can only read and write to LR. So, there should be a
- // mov to setup the count. WLS/DLS perform this move, so find the original
- // and delete it - inserting WLS/DLS in its place.
- MachineBasicBlock *MBB = Start->getParent();
- MachineInstr *InsertPt = Start;
- for (auto &I : MRI->def_instructions(ARM::LR)) {
- if (I.getParent() != MBB)
- continue;
-
- // Always execute.
- if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL)
- continue;
-
- // Only handle move reg, if the trip count it will need moving into a reg
- // before the setup instruction anyway.
- if (!I.getDesc().isMoveReg() ||
- !I.getOperand(1).isIdenticalTo(Start->getOperand(0)))
- continue;
- InsertPt = &I;
- break;
- }
-
+ auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start,
+ MachineInstr *InsertPt) {
+ MachineBasicBlock *MBB = InsertPt->getParent();
unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
ARM::t2DLS : ARM::t2WLS;
MachineInstrBuilder MIB =
@@ -369,16 +501,54 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
RevertWhile(Start);
else
Start->eraseFromParent();
- RevertLoopDec(Dec);
- RevertLoopEnd(End);
+ bool FlagsAlreadySet = RevertLoopDec(Dec, true);
+ RevertLoopEnd(End, FlagsAlreadySet);
} else {
- Start = ExpandLoopStart(ML, Start);
+ Start = ExpandLoopStart(ML, Start, InsertPt);
RemoveDeadBranch(Start);
End = ExpandLoopEnd(ML, Dec, End);
RemoveDeadBranch(End);
}
}
+bool ARMLowOverheadLoops::RevertNonLoops() {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
+ bool Changed = false;
+
+ for (auto &MBB : *MF) {
+ SmallVector<MachineInstr*, 4> Starts;
+ SmallVector<MachineInstr*, 4> Decs;
+ SmallVector<MachineInstr*, 4> Ends;
+
+ for (auto &I : MBB) {
+ if (IsLoopStart(I))
+ Starts.push_back(&I);
+ else if (I.getOpcode() == ARM::t2LoopDec)
+ Decs.push_back(&I);
+ else if (I.getOpcode() == ARM::t2LoopEnd)
+ Ends.push_back(&I);
+ }
+
+ if (Starts.empty() && Decs.empty() && Ends.empty())
+ continue;
+
+ Changed = true;
+
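+ // Revert the remaining pseudos: a WhileLoopStart becomes a cmp and branch,
+ // a DoLoopStart is simply removed, and LoopDec/LoopEnd become a sub and a
+ // cmp/branch.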
+ for (auto *Start : Starts) {
+ if (Start->getOpcode() == ARM::t2WhileLoopStart)
+ RevertWhile(Start);
+ else
+ Start->eraseFromParent();
+ }
+ for (auto *Dec : Decs)
+ RevertLoopDec(Dec);
+
+ for (auto *End : Ends)
+ RevertLoopEnd(End);
+ }
+ return Changed;
+}
+
FunctionPass *llvm::createARMLowOverheadLoopsPass() {
return new ARMLowOverheadLoops();
}
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 90c5ad025e56..c92689f4942e 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -74,8 +74,8 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
switch (MO.getType()) {
default: llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
- // Ignore all non-CPSR implicit register operands.
- if (MO.isImplicit() && MO.getReg() != ARM::CPSR)
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
return false;
assert(!MO.getSubReg() && "Subregs should be eliminated!");
MCOp = MCOperand::createReg(MO.getReg());
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 90d794cd27b1..bb136e92329b 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/Support/ErrorHandling.h"
#include <utility>
@@ -130,6 +131,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// The amount the literal pool has been increased by due to promoted globals.
int PromotedGlobalsIncrease = 0;
+ /// True if r0 will be preserved by a call to this function (e.g. C++
+ /// con/destructors).
+ bool PreservesR0 = false;
+
public:
ARMFunctionInfo() = default;
@@ -247,6 +252,9 @@ public:
}
DenseMap<unsigned, unsigned> EHPrologueRemappedRegs;
+
+ void setPreservesR0() { PreservesR0 = true; }
+ bool getPreservesR0() const { return PreservesR0; }
};
} // end namespace llvm
diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
index 5389d09bf7d7..ae5657a0a2c1 100644
--- a/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -1,4 +1,4 @@
-//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
+//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -18,13 +18,11 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
@@ -45,54 +43,39 @@ static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
cl::desc("Disable the ARM Parallel DSP pass"));
+static cl::opt<unsigned>
+NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
+ cl::desc("Limit the number of loads analysed"));
+
namespace {
- struct OpChain;
- struct BinOpChain;
+ struct MulCandidate;
class Reduction;
- using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>;
- using ReductionList = SmallVector<Reduction, 8>;
- using ValueList = SmallVector<Value*, 8>;
- using MemInstList = SmallVector<LoadInst*, 8>;
- using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
- using PMACPairList = SmallVector<PMACPair, 8>;
- using Instructions = SmallVector<Instruction*,16>;
- using MemLocList = SmallVector<MemoryLocation, 4>;
+ using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
+ using MemInstList = SmallVectorImpl<LoadInst*>;
+ using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>;
- struct OpChain {
+ // 'MulCandidate' holds the multiplication instructions that are candidates
+ // for parallel execution.
+ struct MulCandidate {
Instruction *Root;
- ValueList AllValues;
- MemInstList VecLd; // List of all load instructions.
- MemInstList Loads;
+ Value* LHS;
+ Value* RHS;
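+ // Set when the halfwords of the second operand need exchanging, i.e. the
+ // 'X' (exchange) form of the DSP intrinsic.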
+ bool Exchange = false;
bool ReadOnly = true;
+ bool Paired = false;
+ SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen.
- OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
- virtual ~OpChain() = default;
+ MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
+ Root(I), LHS(lhs), RHS(rhs) { }
- void PopulateLoads() {
- for (auto *V : AllValues) {
- if (auto *Ld = dyn_cast<LoadInst>(V))
- Loads.push_back(Ld);
- }
+ bool HasTwoLoadInputs() const {
+ return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
}
- unsigned size() const { return AllValues.size(); }
- };
-
- // 'BinOpChain' holds the multiplication instructions that are candidates
- // for parallel execution.
- struct BinOpChain : public OpChain {
- ValueList LHS; // List of all (narrow) left hand operands.
- ValueList RHS; // List of all (narrow) right hand operands.
- bool Exchange = false;
-
- BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
- OpChain(I, lhs), LHS(lhs), RHS(rhs) {
- for (auto *V : RHS)
- AllValues.push_back(V);
- }
-
- bool AreSymmetrical(BinOpChain *Other);
+ LoadInst *getBaseLoad() const {
+ return VecLd.front();
+ }
};
/// Represent a sequence of multiply-accumulate operations with the aim to
@@ -100,9 +83,9 @@ namespace {
class Reduction {
Instruction *Root = nullptr;
Value *Acc = nullptr;
- OpChainList Muls;
- PMACPairList MulPairs;
- SmallPtrSet<Instruction*, 4> Adds;
+ MulCandList Muls;
+ MulPairList MulPairs;
+ SetVector<Instruction*> Adds;
public:
Reduction() = delete;
@@ -112,10 +95,35 @@ namespace {
/// Record an Add instruction that is a part of this reduction.
void InsertAdd(Instruction *I) { Adds.insert(I); }
- /// Record a BinOpChain, rooted at a Mul instruction, that is a part of
- /// this reduction.
- void InsertMul(Instruction *I, ValueList &LHS, ValueList &RHS) {
- Muls.push_back(make_unique<BinOpChain>(I, LHS, RHS));
+ /// Create MulCandidates, each rooted at a Mul instruction, that form part
+ /// of this reduction.
+ void InsertMuls() {
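+ // Look through an optional sext to find the mul feeding each add operand.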
+ auto GetMulOperand = [](Value *V) -> Instruction* {
+ if (auto *SExt = dyn_cast<SExtInst>(V)) {
+ if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
+ if (I->getOpcode() == Instruction::Mul)
+ return I;
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ if (I->getOpcode() == Instruction::Mul)
+ return I;
+ }
+ return nullptr;
+ };
+
+ auto InsertMul = [this](Instruction *I) {
+ Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
+ Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
+ Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
+ };
+
+ for (auto *Add : Adds) {
+ if (Add == Acc)
+ continue;
+ if (auto *Mul = GetMulOperand(Add->getOperand(0)))
+ InsertMul(Mul);
+ if (auto *Mul = GetMulOperand(Add->getOperand(1)))
+ InsertMul(Mul);
+ }
}
/// Add the incoming accumulator value, returns true if a value had not
@@ -128,9 +136,17 @@ namespace {
return true;
}
- /// Set two BinOpChains, rooted at muls, that can be executed as a single
+ /// Set two MulCandidates, rooted at muls, that can be executed as a single
/// parallel operation.
- void AddMulPair(BinOpChain *Mul0, BinOpChain *Mul1) {
+ void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
+ bool Exchange = false) {
+ LLVM_DEBUG(dbgs() << "Pairing:\n"
+ << *Mul0->Root << "\n"
+ << *Mul1->Root << "\n");
+ Mul0->Paired = true;
+ Mul1->Paired = true;
+ if (Exchange)
+ Mul1->Exchange = true;
MulPairs.push_back(std::make_pair(Mul0, Mul1));
}
@@ -141,24 +157,40 @@ namespace {
/// Return the add instruction which is the root of the reduction.
Instruction *getRoot() { return Root; }
+ bool is64Bit() const { return Root->getType()->isIntegerTy(64); }
+
+ Type *getType() const { return Root->getType(); }
+
/// Return the incoming value to be accumulated. This may be null.
Value *getAccumulator() { return Acc; }
/// Return the set of adds that comprise the reduction.
- SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; }
+ SetVector<Instruction*> &getAdds() { return Adds; }
- /// Return the BinOpChain, rooted at mul instruction, that comprise the
+ /// Return the MulCandidates, rooted at mul instructions, that comprise
/// the reduction.
- OpChainList &getMuls() { return Muls; }
+ MulCandList &getMuls() { return Muls; }
- /// Return the BinOpChain, rooted at mul instructions, that have been
+ /// Return the MulCandidates, rooted at mul instructions, that have been
/// paired for parallel execution.
- PMACPairList &getMulPairs() { return MulPairs; }
+ MulPairList &getMulPairs() { return MulPairs; }
/// To finalise, replace the uses of the root with the intrinsic call.
void UpdateRoot(Instruction *SMLAD) {
Root->replaceAllUsesWith(SMLAD);
}
+
+ void dump() {
+ LLVM_DEBUG(dbgs() << "Reduction:\n";
+ for (auto *Add : Adds)
+ LLVM_DEBUG(dbgs() << *Add << "\n");
+ for (auto &Mul : Muls)
+ LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
+ << " " << *Mul->LHS << "\n"
+ << " " << *Mul->RHS << "\n");
+ LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
+ );
+ }
};
class WidenedLoad {
@@ -176,13 +208,11 @@ namespace {
}
};
- class ARMParallelDSP : public LoopPass {
+ class ARMParallelDSP : public FunctionPass {
ScalarEvolution *SE;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
DominatorTree *DT;
- LoopInfo *LI;
- Loop *L;
const DataLayout *DL;
Module *M;
std::map<LoadInst*, LoadInst*> LoadPairs;
@@ -190,13 +220,12 @@ namespace {
std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
template<unsigned>
- bool IsNarrowSequence(Value *V, ValueList &VL);
-
+ bool IsNarrowSequence(Value *V);
+ bool Search(Value *V, BasicBlock *BB, Reduction &R);
bool RecordMemoryOps(BasicBlock *BB);
void InsertParallelMACs(Reduction &Reduction);
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
- LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
- IntegerType *LoadTy);
+ LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
bool CreateParallelPairs(Reduction &R);
/// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
@@ -204,60 +233,38 @@ namespace {
/// products to a 32-bit accumulate operand. Optionally, the instruction can
/// exchange the halfwords of the second operand before performing the
/// arithmetic.
- bool MatchSMLAD(Loop *L);
+ bool MatchSMLAD(Function &F);
public:
static char ID;
- ARMParallelDSP() : LoopPass(ID) { }
-
- bool doInitialization(Loop *L, LPPassManager &LPM) override {
- LoadPairs.clear();
- WideLoads.clear();
- return true;
- }
+ ARMParallelDSP() : FunctionPass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- LoopPass::getAnalysisUsage(AU);
+ FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
- AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
- bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
+ bool runOnFunction(Function &F) override {
if (DisableParallelDSP)
return false;
- L = TheLoop;
+ if (skipFunction(F))
+ return false;
+
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &TPC = getAnalysis<TargetPassConfig>();
- BasicBlock *Header = TheLoop->getHeader();
- if (!Header)
- return false;
-
- // TODO: We assume the loop header and latch to be the same block.
- // This is not a fundamental restriction, but lifting this would just
- // require more work to do the transformation and then patch up the CFG.
- if (Header != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
- "running pass ARMParallelDSP\n");
- return false;
- }
-
- if (!TheLoop->getLoopPreheader())
- InsertPreheaderForLoop(L, DT, LI, nullptr, true);
-
- Function &F = *Header->getParent();
M = F.getParent();
DL = &M->getDataLayout();
@@ -282,17 +289,10 @@ namespace {
return false;
}
- LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
-
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
- if (!RecordMemoryOps(Header)) {
- LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
- return false;
- }
-
- bool Changes = MatchSMLAD(L);
+ bool Changes = MatchSMLAD(F);
return Changes;
}
};
@@ -331,40 +331,14 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
// TODO: we currently only collect i16, and will support i8 later, so that's
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
template<unsigned MaxBitWidth>
-bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) {
- ConstantInt *CInt;
-
- if (match(V, m_ConstantInt(CInt))) {
- // TODO: if a constant is used, it needs to fit within the bit width.
- return false;
- }
-
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- Value *Val, *LHS, *RHS;
- if (match(V, m_Trunc(m_Value(Val)))) {
- if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
- return IsNarrowSequence<MaxBitWidth>(Val, VL);
- } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
- // TODO: we need to implement sadd16/sadd8 for this, which enables to
- // also do the rewrite for smlad8.ll, but it is unsupported for now.
- return false;
- } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
- if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
+bool ARMParallelDSP::IsNarrowSequence(Value *V) {
+ if (auto *SExt = dyn_cast<SExtInst>(V)) {
+ if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
return false;
- if (match(Val, m_Load(m_Value()))) {
- auto *Ld = cast<LoadInst>(Val);
-
- // Check that these load could be paired.
- if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld))
- return false;
-
- VL.push_back(Val);
- VL.push_back(I);
- return true;
+ if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
+ // Check that this load could be paired.
+ return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
}
}
return false;
@@ -375,6 +349,9 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) {
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<LoadInst*, 8> Loads;
SmallVector<Instruction*, 8> Writes;
+ LoadPairs.clear();
+ WideLoads.clear();
+ OrderedBasicBlock OrderedBB(BB);
// Collect loads and instruction that may write to memory. For now we only
// record loads which are simple, sign-extended and have a single user.
@@ -389,21 +366,24 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
Loads.push_back(Ld);
}
+ if (Loads.empty() || Loads.size() > NumLoadLimit)
+ return false;
+
using InstSet = std::set<Instruction*>;
using DepMap = std::map<Instruction*, InstSet>;
DepMap RAWDeps;
// Record any writes that may alias a load.
const auto Size = LocationSize::unknown();
- for (auto Read : Loads) {
- for (auto Write : Writes) {
+ for (auto Write : Writes) {
+ for (auto Read : Loads) {
MemoryLocation ReadLoc =
MemoryLocation(Read->getPointerOperand(), Size);
if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
ModRefInfo::ModRef)))
continue;
- if (DT->dominates(Write, Read))
+ if (OrderedBB.dominates(Write, Read))
RAWDeps[Read].insert(Write);
}
}
@@ -411,17 +391,16 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// Check whether there's not a write between the two loads which would
// prevent them from being safely merged.
auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
- LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
- LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
+ LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset;
+ LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base;
if (RAWDeps.count(Dominated)) {
InstSet &WritesBefore = RAWDeps[Dominated];
for (auto Before : WritesBefore) {
-
// We can't move the second load backward, past a write, to merge
// with the first load.
- if (DT->dominates(Dominator, Before))
+ if (OrderedBB.dominates(Dominator, Before))
return false;
}
}
@@ -431,7 +410,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// Record base, offset load pairs.
for (auto *Base : Loads) {
for (auto *Offset : Loads) {
- if (Base == Offset)
+ if (Base == Offset || OffsetLoads.count(Offset))
continue;
if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
@@ -453,7 +432,54 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
return LoadPairs.size() > 1;
}
-// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
+// Search recursively back through the operands to find a tree of values that
+// form a multiply-accumulate chain. The search records the Add and Mul
+// instructions that form the reduction and allows us to find a single value
+// to be used as the initial input to the accumulator.
+bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
+ // If we find a non-instruction, try to use it as the initial accumulator
+ // value. This may have already been found during the search in which case
+ // this function will return false, signaling a search fail.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return R.InsertAcc(V);
+
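+ // The whole reduction must be contained within a single block.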
+ if (I->getParent() != BB)
+ return false;
+
+ switch (I->getOpcode()) {
+ default:
+ break;
+ case Instruction::PHI:
+ // Could be the accumulator value.
+ return R.InsertAcc(V);
+ case Instruction::Add: {
+ // Adds should be adding together two muls, or another add and a mul to
+ // be within the mac chain. One of the operands may also be the
+ // accumulator value at which point we should stop searching.
+ R.InsertAdd(I);
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ bool ValidLHS = Search(LHS, BB, R);
+ bool ValidRHS = Search(RHS, BB, R);
+
+ if (ValidLHS && ValidRHS)
+ return true;
+
+ return R.InsertAcc(I);
+ }
+ case Instruction::Mul: {
+ Value *MulOp0 = I->getOperand(0);
+ Value *MulOp1 = I->getOperand(1);
+ return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
+ }
+ case Instruction::SExt:
+ return Search(I->getOperand(0), BB, R);
+ }
+ return false;
+}
+
+// The pass needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find integer add then look for this pattern:
@@ -484,88 +510,39 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
-bool ARMParallelDSP::MatchSMLAD(Loop *L) {
- // Search recursively back through the operands to find a tree of values that
- // form a multiply-accumulate chain. The search records the Add and Mul
- // instructions that form the reduction and allows us to find a single value
- // to be used as the initial input to the accumlator.
- std::function<bool(Value*, Reduction&)> Search = [&]
- (Value *V, Reduction &R) -> bool {
-
- // If we find a non-instruction, try to use it as the initial accumulator
- // value. This may have already been found during the search in which case
- // this function will return false, signaling a search fail.
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return R.InsertAcc(V);
-
- switch (I->getOpcode()) {
- default:
- break;
- case Instruction::PHI:
- // Could be the accumulator value.
- return R.InsertAcc(V);
- case Instruction::Add: {
- // Adds should be adding together two muls, or another add and a mul to
- // be within the mac chain. One of the operands may also be the
- // accumulator value at which point we should stop searching.
- bool ValidLHS = Search(I->getOperand(0), R);
- bool ValidRHS = Search(I->getOperand(1), R);
- if (!ValidLHS && !ValidLHS)
- return false;
- else if (ValidLHS && ValidRHS) {
- R.InsertAdd(I);
- return true;
- } else {
- R.InsertAdd(I);
- return R.InsertAcc(I);
- }
- }
- case Instruction::Mul: {
- Value *MulOp0 = I->getOperand(0);
- Value *MulOp1 = I->getOperand(1);
- if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
- ValueList LHS;
- ValueList RHS;
- if (IsNarrowSequence<16>(MulOp0, LHS) &&
- IsNarrowSequence<16>(MulOp1, RHS)) {
- R.InsertMul(I, LHS, RHS);
- return true;
- }
- }
- return false;
- }
- case Instruction::SExt:
- return Search(I->getOperand(0), R);
- }
- return false;
- };
-
+bool ARMParallelDSP::MatchSMLAD(Function &F) {
bool Changed = false;
- SmallPtrSet<Instruction*, 4> AllAdds;
- BasicBlock *Latch = L->getLoopLatch();
- for (Instruction &I : reverse(*Latch)) {
- if (I.getOpcode() != Instruction::Add)
+ for (auto &BB : F) {
+ SmallPtrSet<Instruction*, 4> AllAdds;
+ if (!RecordMemoryOps(&BB))
continue;
- if (AllAdds.count(&I))
- continue;
+ for (Instruction &I : reverse(BB)) {
+ if (I.getOpcode() != Instruction::Add)
+ continue;
- const auto *Ty = I.getType();
- if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
- continue;
+ if (AllAdds.count(&I))
+ continue;
- Reduction R(&I);
- if (!Search(&I, R))
- continue;
+ const auto *Ty = I.getType();
+ if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+ continue;
- if (!CreateParallelPairs(R))
- continue;
+ Reduction R(&I);
+ if (!Search(&I, &BB, R))
+ continue;
- InsertParallelMACs(R);
- Changed = true;
- AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
+ R.InsertMuls();
+ LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());
+
+ if (!CreateParallelPairs(R))
+ continue;
+
+ InsertParallelMACs(R);
+ Changed = true;
+ AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
+ }
}
return Changed;
@@ -578,87 +555,57 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
return false;
// Check that the muls operate directly upon sign extended loads.
- for (auto &MulChain : R.getMuls()) {
- // A mul has 2 operands, and a narrow op consist of sext and a load; thus
- // we expect at least 4 items in this operand value list.
- if (MulChain->size() < 4) {
- LLVM_DEBUG(dbgs() << "Operand list too short.\n");
+ for (auto &MulCand : R.getMuls()) {
+ if (!MulCand->HasTwoLoadInputs())
return false;
- }
- MulChain->PopulateLoads();
- ValueList &LHS = static_cast<BinOpChain*>(MulChain.get())->LHS;
- ValueList &RHS = static_cast<BinOpChain*>(MulChain.get())->RHS;
-
- // Use +=2 to skip over the expected extend instructions.
- for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
- if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
- return false;
- }
}
- auto CanPair = [&](Reduction &R, BinOpChain *PMul0, BinOpChain *PMul1) {
- if (!PMul0->AreSymmetrical(PMul1))
- return false;
-
+ auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
// The first elements of each vector should be loads with sexts. If we
// find that its two pairs of consecutive loads, then these can be
// transformed into two wider loads and the users can be replaced with
// DSP intrinsics.
- for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) {
- auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]);
- auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]);
- auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]);
- auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]);
-
- if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
- return false;
+ auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
+ auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
+ auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
+ auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
- LLVM_DEBUG(dbgs() << "Loads:\n"
- << " - " << *Ld0 << "\n"
- << " - " << *Ld1 << "\n"
- << " - " << *Ld2 << "\n"
- << " - " << *Ld3 << "\n");
-
- if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
- if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
- LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
- R.AddMulPair(PMul0, PMul1);
- return true;
- } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
- LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
- LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
- PMul1->Exchange = true;
- R.AddMulPair(PMul0, PMul1);
- return true;
- }
- } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
- AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+ if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
+ if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
- LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
- LLVM_DEBUG(dbgs() << " and swapping muls\n");
- PMul0->Exchange = true;
- // Only the second operand can be exchanged, so swap the muls.
- R.AddMulPair(PMul1, PMul0);
+ R.AddMulPair(PMul0, PMul1);
+ return true;
+ } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
+ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+ LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
+ R.AddMulPair(PMul0, PMul1, true);
return true;
}
+ } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
+ AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
+ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
+ LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
+ LLVM_DEBUG(dbgs() << " and swapping muls\n");
+ // Only the second operand can be exchanged, so swap the muls.
+ R.AddMulPair(PMul1, PMul0, true);
+ return true;
}
return false;
};
- OpChainList &Muls = R.getMuls();
+ MulCandList &Muls = R.getMuls();
const unsigned Elems = Muls.size();
- SmallPtrSet<const Instruction*, 4> Paired;
for (unsigned i = 0; i < Elems; ++i) {
- BinOpChain *PMul0 = static_cast<BinOpChain*>(Muls[i].get());
- if (Paired.count(PMul0->Root))
+ MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
+ if (PMul0->Paired)
continue;
for (unsigned j = 0; j < Elems; ++j) {
if (i == j)
continue;
- BinOpChain *PMul1 = static_cast<BinOpChain*>(Muls[j].get());
- if (Paired.count(PMul1->Root))
+ MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
+ if (PMul1->Paired)
continue;
const Instruction *Mul0 = PMul0->Root;
@@ -668,29 +615,19 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
assert(PMul0 != PMul1 && "expected different chains");
- if (CanPair(R, PMul0, PMul1)) {
- Paired.insert(Mul0);
- Paired.insert(Mul1);
+ if (CanPair(R, PMul0, PMul1))
break;
- }
}
}
return !R.getMulPairs().empty();
}
-
void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
- auto CreateSMLADCall = [&](SmallVectorImpl<LoadInst*> &VecLd0,
- SmallVectorImpl<LoadInst*> &VecLd1,
- Value *Acc, bool Exchange,
- Instruction *InsertAfter) {
+ auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
+ Value *Acc, bool Exchange,
+ Instruction *InsertAfter) {
// Replace the reduction chain with an intrinsic call
- IntegerType *Ty = IntegerType::get(M->getContext(), 32);
- LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
- WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty);
- LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
- WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty);
Value* Args[] = { WideLd0, WideLd1, Acc };
Function *SMLAD = nullptr;
@@ -704,34 +641,95 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
+ BasicBlock::iterator(InsertAfter));
Instruction *Call = Builder.CreateCall(SMLAD, Args);
NumSMLAD++;
return Call;
};
- Instruction *InsertAfter = R.getRoot();
+ // Return the instruction after the dominated instruction.
+ auto GetInsertPoint = [this](Value *A, Value *B) {
+ assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
+ "expected at least one instruction");
+
+ Value *V = nullptr;
+ if (!isa<Instruction>(A))
+ V = B;
+ else if (!isa<Instruction>(B))
+ V = A;
+ else
+ V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;
+
+ return &*++BasicBlock::iterator(cast<Instruction>(V));
+ };
+
Value *Acc = R.getAccumulator();
- if (!Acc)
- Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
- LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n"
- << "Acc: " << *Acc << "\n");
+ // For any muls that were discovered but not paired, accumulate their values
+ // as before.
+ IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
+ MulCandList &MulCands = R.getMuls();
+ for (auto &MulCand : MulCands) {
+ if (MulCand->Paired)
+ continue;
+
+ Instruction *Mul = cast<Instruction>(MulCand->Root);
+ LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");
+
+ if (R.getType() != Mul->getType()) {
+ assert(R.is64Bit() && "expected 64-bit result");
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
+ Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
+ }
+
+ if (!Acc) {
+ Acc = Mul;
+ continue;
+ }
+
+ // If Acc is the original incoming value to the reduction, it could be a
+ // phi. But the phi will dominate Mul, meaning that Mul will be the
+ // insertion point.
+ Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
+ Acc = Builder.CreateAdd(Mul, Acc);
+ }
+
+ if (!Acc) {
+ Acc = R.is64Bit() ?
+ ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
+ ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
+ } else if (Acc->getType() != R.getType()) {
+ Builder.SetInsertPoint(R.getRoot());
+ Acc = Builder.CreateSExt(Acc, R.getType());
+ }
+
+ // Roughly sort the mul pairs in their program order.
+ OrderedBasicBlock OrderedBB(R.getRoot()->getParent());
+ llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) {
+ const Instruction *A = PairA.first->Root;
+ const Instruction *B = PairB.first->Root;
+ return OrderedBB.dominates(A, B);
+ });
+
+ IntegerType *Ty = IntegerType::get(M->getContext(), 32);
for (auto &Pair : R.getMulPairs()) {
- BinOpChain *PMul0 = Pair.first;
- BinOpChain *PMul1 = Pair.second;
- LLVM_DEBUG(dbgs() << "Muls:\n"
- << "- " << *PMul0->Root << "\n"
- << "- " << *PMul1->Root << "\n");
-
- Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
- InsertAfter);
- InsertAfter = cast<Instruction>(Acc);
+ MulCandidate *LHSMul = Pair.first;
+ MulCandidate *RHSMul = Pair.second;
+ LoadInst *BaseLHS = LHSMul->getBaseLoad();
+ LoadInst *BaseRHS = RHSMul->getBaseLoad();
+ LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
+ WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
+ LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
+ WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);
+
+ Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
+ InsertAfter = GetInsertPoint(InsertAfter, Acc);
+ Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
}
R.UpdateRoot(cast<Instruction>(Acc));
}
-LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
IntegerType *LoadTy) {
assert(Loads.size() == 2 && "currently only support widening two loads");
@@ -758,8 +756,8 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
return;
Source->moveBefore(Sink);
- for (auto &U : Source->uses())
- MoveBefore(Source, U.getUser());
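+ // Also pull the operands of Source above it so that their definitions
+ // still dominate their uses.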
+ for (auto &Op : Source->operands())
+ MoveBefore(Op, Source);
};
// Insert the load at the point of the original dominating load.
@@ -784,57 +782,30 @@ LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
// Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
// TODO: Support big-endian as well.
Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
- BaseSExt->setOperand(0, Bottom);
+ Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
+ BaseSExt->replaceAllUsesWith(NewBaseSExt);
IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
- OffsetSExt->setOperand(0, Trunc);
-
+ Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
+ OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
+
+ LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
+ << *Base << "\n" << *Offset << "\n"
+ << "Created Wide Load:\n"
+ << *WideLoad << "\n"
+ << *Bottom << "\n"
+ << *NewBaseSExt << "\n"
+ << *Top << "\n"
+ << *Trunc << "\n"
+ << *NewOffsetSExt << "\n");
WideLoads.emplace(std::make_pair(Base,
- make_unique<WidenedLoad>(Loads, WideLoad)));
+ std::make_unique<WidenedLoad>(Loads, WideLoad)));
return WideLoad;
}
-// Compare the value lists in Other to this chain.
-bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
- // Element-by-element comparison of Value lists returning true if they are
- // instructions with the same opcode or constants with the same value.
- auto CompareValueList = [](const ValueList &VL0,
- const ValueList &VL1) {
- if (VL0.size() != VL1.size()) {
- LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
- << VL0.size() << " != " << VL1.size() << "\n");
- return false;
- }
-
- const unsigned Pairs = VL0.size();
-
- for (unsigned i = 0; i < Pairs; ++i) {
- const Value *V0 = VL0[i];
- const Value *V1 = VL1[i];
- const auto *Inst0 = dyn_cast<Instruction>(V0);
- const auto *Inst1 = dyn_cast<Instruction>(V1);
-
- if (!Inst0 || !Inst1)
- return false;
-
- if (Inst0->isSameOperationAs(Inst1))
- continue;
-
- const APInt *C0, *C1;
- if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
- return false;
- }
-
- return true;
- };
-
- return CompareValueList(LHS, Other->LHS) &&
- CompareValueList(RHS, Other->RHS);
-}
-
Pass *llvm::createARMParallelDSPPass() {
return new ARMParallelDSP();
}
@@ -842,6 +813,6 @@ Pass *llvm::createARMParallelDSPPass() {
char ARMParallelDSP::ID = 0;
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
- "Transform loops to use DSP intrinsics", false, false)
+ "Transform functions to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
- "Transform loops to use DSP intrinsics", false, false)
+ "Transform functions to use DSP intrinsics", false, false)
diff --git a/lib/Target/ARM/ARMPredicates.td b/lib/Target/ARM/ARMPredicates.td
index 0b6b40de80dd..b008d3e2e296 100644
--- a/lib/Target/ARM/ARMPredicates.td
+++ b/lib/Target/ARM/ARMPredicates.td
@@ -71,7 +71,7 @@ def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">,
- AssemblerPredicate<"FeatureVFP2_D16_SP", "VFP2">;
+ AssemblerPredicate<"FeatureVFP2_SP", "VFP2">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3Base()">,
AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">;
def HasVFP4 : Predicate<"Subtarget->hasVFP4Base()">,
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 92ae26b3729d..56055a15483a 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -180,7 +180,7 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>;
// models the APSR when it's accessed by some special instructions. In such cases
// it has the same encoding as PC.
def CPSR : ARMReg<0, "cpsr">;
-def APSR : ARMReg<1, "apsr">;
+def APSR : ARMReg<15, "apsr">;
def APSR_NZCV : ARMReg<15, "apsr_nzcv">;
def SPSR : ARMReg<2, "spsr">;
def FPSCR : ARMReg<3, "fpscr">;
@@ -486,12 +486,20 @@ def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
// Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP.
// These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs.
-def Tuples2R : RegisterTuples<[gsub_0, gsub_1],
- [(add R0, R2, R4, R6, R8, R10, R12),
- (add R1, R3, R5, R7, R9, R11, SP)]>;
+def Tuples2Rnosp : RegisterTuples<[gsub_0, gsub_1],
+ [(add R0, R2, R4, R6, R8, R10),
+ (add R1, R3, R5, R7, R9, R11)]>;
+
+def Tuples2Rsp : RegisterTuples<[gsub_0, gsub_1],
+ [(add R12), (add SP)]>;
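+// R12 and SP are kept in a separate tuple so that GPRPairnosp below can
+// exclude them.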
// Register class representing a pair of even-odd GPRs.
-def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> {
+def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp, Tuples2Rsp)> {
+ let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+}
+
+// Register class representing a pair of even-odd GPRs, except (R12, SP).
+def GPRPairnosp : RegisterClass<"ARM", [untyped], 64, (add Tuples2Rnosp)> {
let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
}
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 21d32bde4710..3f0b71afd977 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -2239,9 +2239,9 @@ def A9WriteLMfpPostRA : SchedWriteVariant<[
// Distinguish between our multiple MI-level forms of the same
// VLDM/VSTM instructions.
def A9PreRA : SchedPredicate<
- "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+ "Register::isVirtualRegister(MI->getOperand(0).getReg())">;
def A9PostRA : SchedPredicate<
- "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+ "Register::isPhysicalRegister(MI->getOperand(0).getReg())">;
// VLDM represents all destination registers as a single register
// tuple, unlike LDM. So the number of write operands is not variadic.
diff --git a/lib/Target/ARM/ARMScheduleM4.td b/lib/Target/ARM/ARMScheduleM4.td
index 38c8ea2b4f35..bfa5fc0d7131 100644
--- a/lib/Target/ARM/ARMScheduleM4.td
+++ b/lib/Target/ARM/ARMScheduleM4.td
@@ -18,6 +18,9 @@ def CortexM4Model : SchedMachineModel {
let PostRAScheduler = 1;
let CompleteModel = 0;
+ let UnsupportedFeatures = [IsARM, HasNEON, HasDotProd, HasZCZ, HasMVEInt,
+ IsNotMClass, HasDPVFP, HasFPARMv8, HasFullFP16, Has8MSecExt, HasV8,
+ HasV8_3a, HasTrustZone, HasDFB, IsWindows];
}
@@ -50,6 +53,7 @@ def : M4UnitL2<WriteMAC16>;
def : M4UnitL2<WriteDIV>;
def : M4UnitL2I<(instregex "(t|t2)LDM")>;
+def : M4UnitL2I<(instregex "(t|t2)LDR")>;
// Stores we use a latency of 1 as they have no outputs
@@ -78,9 +82,20 @@ def : M4UnitL1<WriteNoop>;
def : M4UnitL1<WritePreLd>;
def : M4UnitL1I<(instregex "(t|t2)MOV")>;
def : M4UnitL1I<(instrs COPY)>;
-def : M4UnitL1I<(instregex "t2IT")>;
-def : M4UnitL1I<(instregex "t2SEL", "t2USAD8",
- "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>;
+def : M4UnitL1I<(instregex "t2IT", "t2MSR", "t2MRS")>;
+def : M4UnitL1I<(instregex "t2CLREX")>;
+def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", "t2SML[AS]",
+ "t2(S|Q|SH|U|UQ|UH|QD)(ADD|ASX|SAX|SUB)", "t2USADA8", "(t|t2)REV")>;
+
+// These instructions are not of much interest to scheduling as they either
+// will not be generated or are not very useful to schedule. They are here to make
+// the model more complete.
+def : M4UnitL1I<(instregex "t2CDP", "t2LDC", "t2MCR", "t2MRC", "t2MRRC", "t2STC")>;
+def : M4UnitL1I<(instregex "tCPS", "t2ISB", "t2DSB", "t2DMB", "t2?HINT$")>;
+def : M4UnitL1I<(instregex "t2?UDF$", "tBKPT", "t2DBG")>;
+def : M4UnitL1I<(instregex "t?2?Int_eh_sjlj_", "tADDframe", "t?ADJCALL")>;
+def : M4UnitL1I<(instregex "CMP_SWAP", "JUMPTABLE", "MEMCPY")>;
+def : M4UnitL1I<(instregex "VSETLNi32", "VGETLNi32")>;
def : ReadAdvance<ReadALU, 0>;
def : ReadAdvance<ReadALUsr, 0>;
@@ -112,6 +127,9 @@ def : M4UnitL1<WriteVST1>;
def : M4UnitL1<WriteVST2>;
def : M4UnitL1<WriteVST3>;
def : M4UnitL1<WriteVST4>;
+def : M4UnitL1I<(instregex "VMOVS", "FCONSTS", "VCMP", "VNEG", "VABS")>;
+def : M4UnitL2I<(instregex "VMOVD")>;
+def : M4UnitL1I<(instregex "VMRS", "VMSR", "FMSTAT")>;
def : ReadAdvance<ReadFPMUL, 0>;
def : ReadAdvance<ReadFPMAC, 0>;
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 978faed776b0..09603057b2c8 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -125,7 +125,7 @@ const CallLowering *ARMSubtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
-const InstructionSelector *ARMSubtarget::getInstructionSelector() const {
+InstructionSelector *ARMSubtarget::getInstructionSelector() const {
return InstSelector.get();
}
@@ -205,9 +205,9 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
NoARM = true;
if (isAAPCS_ABI())
- stackAlignment = 8;
+ stackAlignment = Align(8);
if (isTargetNaCl() || isAAPCS16_ABI())
- stackAlignment = 16;
+ stackAlignment = Align(16);
// FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
// emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
@@ -253,6 +253,10 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isRWPI())
ReserveR9 = true;
+ // If MVEVectorCostFactor is still 0 (has not been set to anything else),
+ // default it to 2.
+ if (MVEVectorCostFactor == 0)
+ MVEVectorCostFactor = 2;
+
// FIXME: Teach TableGen to deal with these instead of doing it manually here.
switch (ARMProcFamily) {
case Others:
@@ -296,13 +300,15 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
LdStMultipleTiming = SingleIssuePlusExtras;
MaxInterleaveFactor = 4;
if (!isThumb())
- PrefLoopAlignment = 3;
+ PrefLoopLogAlignment = 3;
break;
case Kryo:
break;
case Krait:
PreISelOperandLatencyAdjustment = 1;
break;
+ case NeoverseN1:
+ break;
case Swift:
MaxInterleaveFactor = 2;
LdStMultipleTiming = SingleIssuePlusExtras;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index c2b0f052b843..ef460342a69e 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -71,6 +71,7 @@ protected:
Exynos,
Krait,
Kryo,
+ NeoverseN1,
Swift
};
enum ARMProcClassEnum {
@@ -179,11 +180,9 @@ protected:
bool HasVFPv3SP = false;
bool HasVFPv4SP = false;
bool HasFPARMv8SP = false;
- bool HasVFPv2D16 = false;
bool HasVFPv3D16 = false;
bool HasVFPv4D16 = false;
bool HasFPARMv8D16 = false;
- bool HasVFPv2D16SP = false;
bool HasVFPv3D16SP = false;
bool HasVFPv4D16SP = false;
bool HasFPARMv8D16SP = false;
@@ -450,7 +449,7 @@ protected:
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
- unsigned stackAlignment = 4;
+ Align stackAlignment = Align(4);
/// CPUString - String name of used CPU.
std::string CPUString;
@@ -469,7 +468,12 @@ protected:
int PreISelOperandLatencyAdjustment = 2;
/// What alignment is preferred for loop bodies, in log2(bytes).
- unsigned PrefLoopAlignment = 0;
+ unsigned PrefLoopLogAlignment = 0;
+
+ /// The cost factor for MVE instructions, representing the number of beats an
+ /// instruction can take. The default is 2 (set in initSubtargetFeatures so
+ /// that subtarget features can set it to less than 2).
+ unsigned MVEVectorCostFactor = 0;
/// OptMinSize - True if we're optimising for minimum code size, equal to
/// the function attribute.
@@ -535,7 +539,7 @@ public:
}
const CallLowering *getCallLowering() const override;
- const InstructionSelector *getInstructionSelector() const override;
+ InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
@@ -600,7 +604,7 @@ public:
bool hasARMOps() const { return !NoARM; }
- bool hasVFP2Base() const { return HasVFPv2D16SP; }
+ bool hasVFP2Base() const { return HasVFPv2SP; }
bool hasVFP3Base() const { return HasVFPv3D16SP; }
bool hasVFP4Base() const { return HasVFPv4D16SP; }
bool hasFPARMv8Base() const { return HasFPARMv8D16SP; }
@@ -668,6 +672,12 @@ public:
bool hasSB() const { return HasSB; }
bool genLongCalls() const { return GenLongCalls; }
bool genExecuteOnly() const { return GenExecuteOnly; }
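+ // Whether the scalar DSP instructions are available: the DSP extension in
+ // Thumb, or ARMv5TE and later in ARM state.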
+ bool hasBaseDSP() const {
+ if (isThumb())
+ return hasDSP();
+ else
+ return hasV5TEOps();
+ }
bool hasFP16() const { return HasFP16; }
bool hasD32() const { return HasD32; }
@@ -812,7 +822,7 @@ public:
/// getStackAlignment - Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
- unsigned getStackAlignment() const { return stackAlignment; }
+ Align getStackAlignment() const { return stackAlignment; }
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
@@ -853,9 +863,9 @@ public:
return isROPI() || !isTargetELF();
}
- unsigned getPrefLoopAlignment() const {
- return PrefLoopAlignment;
- }
+ unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; }
+
+ unsigned getMVEVectorCostFactor() const { return MVEVectorCostFactor; }
bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
unsigned PhysReg) const override;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 7f0aae1739b3..5c8007f101d9 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -96,15 +96,16 @@ extern "C" void LLVMInitializeARMTarget() {
initializeARMExpandPseudoPass(Registry);
initializeThumb2SizeReducePass(Registry);
initializeMVEVPTBlockPass(Registry);
+ initializeMVETailPredicationPass(Registry);
initializeARMLowOverheadLoopsPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO())
- return llvm::make_unique<TargetLoweringObjectFileMachO>();
+ return std::make_unique<TargetLoweringObjectFileMachO>();
if (TT.isOSWindows())
- return llvm::make_unique<TargetLoweringObjectFileCOFF>();
- return llvm::make_unique<ARMElfTargetObjectFile>();
+ return std::make_unique<TargetLoweringObjectFileCOFF>();
+ return std::make_unique<ARMElfTargetObjectFile>();
}
static ARMBaseTargetMachine::ARMABI
@@ -282,7 +283,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle,
+ I = std::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle,
F.hasMinSize());
if (!I->isThumb() && !I->hasARMOps())
@@ -447,8 +448,10 @@ bool ARMPassConfig::addPreISel() {
MergeExternalByDefault));
}
- if (TM->getOptLevel() != CodeGenOpt::None)
+ if (TM->getOptLevel() != CodeGenOpt::None) {
addPass(createHardwareLoopsPass());
+ addPass(createMVETailPredicationPass());
+ }
return false;
}
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 2a8ec734a05f..86c8684d14dc 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -36,8 +36,12 @@ using namespace llvm;
#define DEBUG_TYPE "armtti"
+static cl::opt<bool> EnableMaskedLoadStores(
+ "enable-arm-maskedldst", cl::Hidden, cl::init(false),
+ cl::desc("Enable the generation of masked loads and stores"));
+
static cl::opt<bool> DisableLowOverheadLoops(
- "disable-arm-loloops", cl::Hidden, cl::init(true),
+ "disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
@@ -167,6 +171,42 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ // The extend of a load is free
+ if (I && isa<LoadInst>(I->getOperand(0))) {
+ static const TypeConversionCostTblEntry LoadConversionTbl[] = {
+ {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
+ {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
+ {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
+ {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
+ {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
+ {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
+ {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
+ {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
+ {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
+ {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
+ {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
+ {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
+ };
+ if (const auto *Entry = ConvertCostTableLookup(
+ LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+ return Entry->Cost;
+
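+ // With MVE, an extending vector load can be a single widening load
+ // instruction, so these extends are treated as free.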
+ static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
+ {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
+ {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
+ {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
+ {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
+ {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
+ {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
+ };
+ if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVELoadConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+ return Entry->Cost;
+ }
+ }
+
// Some arithmetic, load and store operations have specific instructions
// to cast up/down their types automatically at no extra cost.
// TODO: Get these tables to know at least what the related operations are.
@@ -313,6 +353,31 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Entry->Cost;
}
+ // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
+ // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sexts
+ // are linearised so take more.
+ static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
+ };
+
+ if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
+ if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
+ ISD, DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost * ST->getMVEVectorCostFactor();
+ }
+
// Scalar integer conversion costs.
static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
// i16 -> i64 requires two dependent operations.
@@ -332,7 +397,10 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Entry->Cost;
}
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
}
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -343,8 +411,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
return 3;
- if ((Opcode == Instruction::InsertElement ||
- Opcode == Instruction::ExtractElement)) {
+ if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
+ Opcode == Instruction::ExtractElement)) {
// Cross-class copies are expensive on many microarchitectures,
// so assume they are expensive by default.
if (ValTy->getVectorElementType()->isIntegerTy())
@@ -357,6 +425,17 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
}
+ if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
+ Opcode == Instruction::ExtractElement)) {
+ // We say MVE moves cost at least the MVEVectorCostFactor, even though
+ // they are scalar instructions. This helps prevent mixing scalar and
+ // vector code, to avoid vectorising where we would end up just
+ // scalarising the result anyway.
+ return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
+ ST->getMVEVectorCostFactor()) *
+ ValTy->getVectorNumElements() / 2;
+ }
+
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
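A rough standalone model of the formula above: the per-lane cost is the larger of the base cost and the MVE cost factor, and half of the lanes are charged. This is only a sketch with hypothetical inputs, not the TTI hook itself.

#include <algorithm>
// Sketch of the MVE insert/extract element cost: at least the MVE cost factor
// per element, charged for half of the lanes (all values here are assumptions).
int mveInsertExtractCost(int BaseElementCost, int MVEVectorCostFactor,
                         int NumVectorElements) {
  return std::max(BaseElementCost, MVEVectorCostFactor) * NumVectorElements / 2;
}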
@@ -385,7 +464,10 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
return LT.first;
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -397,13 +479,37 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
unsigned NumVectorInstToHideOverhead = 10;
int MaxMergeDistance = 64;
- if (Ty->isVectorTy() && SE &&
- !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
- return NumVectorInstToHideOverhead;
+ if (ST->hasNEON()) {
+ if (Ty->isVectorTy() && SE &&
+ !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
+ return NumVectorInstToHideOverhead;
- // In many cases the address computation is not merged into the instruction
- // addressing mode.
- return 1;
+ // In many cases the address computation is not merged into the instruction
+ // addressing mode.
+ return 1;
+ }
+ return BaseT::getAddressComputationCost(Ty, SE, Ptr);
+}
+
+bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
+ if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
+ return false;
+
+ if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
+ // Don't support v2i1 yet.
+ if (VecTy->getNumElements() == 2)
+ return false;
+
+ // We don't support extending fp types.
+ unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
+ if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
+ return false;
+ }
+
+ unsigned EltWidth = DataTy->getScalarSizeInBits();
+ return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
+ (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
+ (EltWidth == 8);
}
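In other words, masked loads and stores are reported legal for byte elements at any alignment, halfword elements at unknown or 2+ alignment, and word elements at unknown or 4+ alignment, after the 2-element and narrow floating-point cases are rejected. Below is a standalone sketch of just that final check, with a plain integer (0 meaning "unspecified") standing in for MaybeAlign.

// Illustrative re-statement of the element-width/alignment rule above.
bool mveMaskedEltWidthAlignmentOK(unsigned EltWidthBits, unsigned Alignment) {
  return (EltWidthBits == 32 && (Alignment == 0 || Alignment >= 4)) ||
         (EltWidthBits == 16 && (Alignment == 0 || Alignment >= 2)) ||
         EltWidthBits == 8;
}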
int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
@@ -442,78 +548,96 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
- if (Kind == TTI::SK_Broadcast) {
- static const CostTblEntry NEONDupTbl[] = {
- // VDUP handles these cases.
- {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
-
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-
- if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE,
- LT.second))
- return LT.first * Entry->Cost;
-
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
- }
- if (Kind == TTI::SK_Reverse) {
- static const CostTblEntry NEONShuffleTbl[] = {
- // Reverse shuffle cost one instruction if we are shuffling within a
- // double word (vrev) or two if we shuffle a quad word (vrev, vext).
- {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
-
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-
- if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE,
- LT.second))
- return LT.first * Entry->Cost;
-
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
- }
- if (Kind == TTI::SK_Select) {
- static const CostTblEntry NEONSelShuffleTbl[] = {
- // Select shuffle cost table for ARM. Cost is the number of instructions
- // required to create the shuffled vector.
+ if (ST->hasNEON()) {
+ if (Kind == TTI::SK_Broadcast) {
+ static const CostTblEntry NEONDupTbl[] = {
+ // VDUP handles these cases.
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ if (const auto *Entry =
+ CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+ }
+ if (Kind == TTI::SK_Reverse) {
+ static const CostTblEntry NEONShuffleTbl[] = {
+ // Reverse shuffles cost one instruction if we are shuffling within a
+ // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ if (const auto *Entry =
+ CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+ }
+ if (Kind == TTI::SK_Select) {
+ static const CostTblEntry NEONSelShuffleTbl[] = {
+ // Select shuffle cost table for ARM.
+ // Cost is the number of instructions required to create the
+ // shuffled vector.
- {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
+ ISD::VECTOR_SHUFFLE, LT.second))
+ return LT.first * Entry->Cost;
+ }
+ }
+ if (ST->hasMVEIntegerOps()) {
+ if (Kind == TTI::SK_Broadcast) {
+ static const CostTblEntry MVEDupTbl[] = {
+ // VDUP handles these cases.
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
+ LT.second))
+ return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
+ }
}
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
int ARMTTIImpl::getArithmeticInstrCost(
@@ -567,38 +691,64 @@ int ARMTTIImpl::getArithmeticInstrCost(
// Multiplication.
};
- if (ST->hasNEON())
+ if (ST->hasNEON()) {
if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
return LT.first * Entry->Cost;
- int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
- Opd1PropInfo, Opd2PropInfo);
-
- // This is somewhat of a hack. The problem that we are facing is that SROA
- // creates a sequence of shift, and, or instructions to construct values.
- // These sequences are recognized by the ISel and have zero-cost. Not so for
- // the vectorized code. Because we have support for v2i64 but not i64 those
- // sequences look particularly beneficial to vectorize.
- // To work around this we increase the cost of v2i64 operations to make them
- // seem less beneficial.
- if (LT.second == MVT::v2i64 &&
- Op2Info == TargetTransformInfo::OK_UniformConstantValue)
- Cost += 4;
-
- return Cost;
+ int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo);
+
+ // This is somewhat of a hack. The problem that we are facing is that SROA
+ // creates a sequence of shift, and, or instructions to construct values.
+ // These sequences are recognized by the ISel and have zero-cost. Not so for
+ // the vectorized code. Because we have support for v2i64 but not i64 those
+ // sequences look particularly beneficial to vectorize.
+ // To work around this we increase the cost of v2i64 operations to make them
+ // seem less beneficial.
+ if (LT.second == MVT::v2i64 &&
+ Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+ Cost += 4;
+
+ return Cost;
+ }
+
+ int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+
+ // The rest of this mostly follows what is done in
+ // BaseT::getArithmeticInstrCost, without treating floats as more expensive
+ // than scalars or increasing the costs for custom operations. The result is
+ // also multiplied by the MVEVectorCostFactor where appropriate.
+ if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
+ return LT.first * BaseCost;
+
+ // Otherwise this is Expand: assume that we need to scalarize this op.
+ if (Ty->isVectorTy()) {
+ unsigned Num = Ty->getVectorNumElements();
+ unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+ // Return the cost of multiple scalar invocations plus the cost of
+ // inserting and extracting the values.
+ return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
+ }
+
+ return BaseCost;
}
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace, const Instruction *I) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- if (Src->isVectorTy() && Alignment != 16 &&
+ if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isDoubleTy()) {
// Unaligned loads/stores are extremely inefficient.
// We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
return LT.first * 4;
}
- return LT.first;
+ int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
+ ? ST->getMVEVectorCostFactor()
+ : 1;
+ return BaseCost * LT.first;
}
int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
@@ -893,6 +1043,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
return;
}
+ // Don't unroll vectorised loops. MVE does not benefit from unrolling as
+ // much as scalar code does.
+ if (I.getType()->isVectorTy())
+ return;
+
SmallVector<const Value*, 4> Operands(I.value_op_begin(),
I.value_op_end());
Cost += getUserCost(&I, Operands);
@@ -914,3 +1069,28 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (Cost < 12)
UP.Force = true;
}
+
+bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ switch (Opcode) {
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Mul:
+ case Instruction::FCmp:
+ return false;
+ case Instruction::ICmp:
+ case Instruction::Add:
+ return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128;
+ default:
+ llvm_unreachable("Unhandled reduction opcode");
+ }
+ return false;
+}
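So with MVE only integer add and compare reductions over a full 128-bit vector use the reduction intrinsics, and v2i64 is excluded by the sub-64-bit element check. A small standalone sketch of that width test (a hypothetical helper, not part of the patch):

// Only add/icmp reductions over a full 128-bit vector with sub-64-bit
// elements (v16i8, v8i16, v4i32) pass the check above.
bool fitsMVEVectorReduction(unsigned ScalarBits, unsigned NumElements) {
  return ScalarBits < 64 && ScalarBits * NumElements == 128;
}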
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 52f6ea4a6e2f..a878fdcfe3c7 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -101,9 +101,9 @@ public:
/// Floating-point computation using ARMv8 AArch32 Advanced
/// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
- /// is IEEE-754 compliant, but it's not covered in this target.
+ /// and Arm MVE are IEEE-754 compliant.
bool isFPVectorizationPotentiallyUnsafe() {
- return !ST->isTargetDarwin();
+ return !ST->isTargetDarwin() && !ST->hasMVEFloatOps();
}
/// \name Scalar TTI Implementations
@@ -122,10 +122,13 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector) {
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector) {
if (ST->hasNEON())
return 16;
+ if (ST->hasMVEIntegerOps())
+ return 8;
return 0;
}
@@ -138,6 +141,8 @@ public:
if (Vector) {
if (ST->hasNEON())
return 128;
+ if (ST->hasMVEIntegerOps())
+ return 128;
return 0;
}
@@ -148,10 +153,23 @@ public:
return ST->getMaxInterleaveFactor();
}
+ bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment);
+
+ bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) {
+ return isLegalMaskedLoad(DataTy, Alignment);
+ }
+
int getMemcpyCost(const Instruction *I);
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const;
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const {
+ return false;
+ }
+
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I = nullptr);
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 1da9452f1d22..d2c355c1da75 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -2275,6 +2275,14 @@ public:
return Value >= 1 && Value <= 32;
}
+ bool isMveSaturateOp() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ uint64_t Value = CE->getValue();
+ return Value == 48 || Value == 64;
+ }
+
bool isITCondCodeNoAL() const {
if (!isITCondCode()) return false;
ARMCC::CondCodes CC = getCondCode();
@@ -2479,28 +2487,28 @@ public:
void addModImmNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint32_t Enc = ARM_AM::getSOImmVal(~CE->getValue());
Inst.addOperand(MCOperand::createImm(Enc));
}
void addModImmNegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint32_t Enc = ARM_AM::getSOImmVal(-CE->getValue());
Inst.addOperand(MCOperand::createImm(Enc));
}
void addThumbModImmNeg8_255Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint32_t Val = -CE->getValue();
Inst.addOperand(MCOperand::createImm(Val));
}
void addThumbModImmNeg1_7Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint32_t Val = -CE->getValue();
Inst.addOperand(MCOperand::createImm(Val));
}
@@ -2523,19 +2531,19 @@ public:
void addFBits16Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(16 - CE->getValue()));
}
void addFBits32Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(32 - CE->getValue()));
}
void addFPImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
Inst.addOperand(MCOperand::createImm(Val));
}
@@ -2544,7 +2552,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// FIXME: We really want to scale the value here, but the LDRD/STRD
// instruction don't encode operands that way yet.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
@@ -2552,35 +2560,31 @@ public:
assert(N == 1 && "Invalid number of operands!");
// FIXME: We really want to scale the value here, but the VSTR/VLDR_VSYSR
// instruction don't encode operands that way yet.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
void addImm7Shift0Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- assert(CE != nullptr && "Invalid operand type!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
void addImm7Shift1Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- assert(CE != nullptr && "Invalid operand type!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
void addImm7Shift2Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- assert(CE != nullptr && "Invalid operand type!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
void addImm7Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- assert(CE != nullptr && "Invalid operand type!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
@@ -2588,7 +2592,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The immediate is scaled by four in the encoding and is stored
// in the MCInst as such. Lop off the low two bits here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
}
@@ -2596,7 +2600,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The immediate is scaled by four in the encoding and is stored
// in the MCInst as such. Lop off the low two bits here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(-(CE->getValue() / 4)));
}
@@ -2604,7 +2608,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The immediate is scaled by four in the encoding and is stored
// in the MCInst as such. Lop off the low two bits here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
}
@@ -2612,7 +2616,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The constant encodes as the immediate-1, and we store in the instruction
// the bits as encoded, so subtract off one here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() - 1));
}
@@ -2620,7 +2624,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The constant encodes as the immediate-1, and we store in the instruction
// the bits as encoded, so subtract off one here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() - 1));
}
@@ -2628,7 +2632,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The constant encodes as the immediate, except for 32, which encodes as
// zero.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Imm = CE->getValue();
Inst.addOperand(MCOperand::createImm((Imm == 32 ? 0 : Imm)));
}
@@ -2637,7 +2641,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// An ASR value of 32 encodes as 0, so that's how we want to add it to
// the instruction as well.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
int Val = CE->getValue();
Inst.addOperand(MCOperand::createImm(Val == 32 ? 0 : Val));
}
@@ -2646,7 +2650,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The operand is actually a t2_so_imm, but we have its bitwise
// negation in the assembly source, so twiddle it here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(~(uint32_t)CE->getValue()));
}
@@ -2654,7 +2658,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The operand is actually a t2_so_imm, but we have its
// negation in the assembly source, so twiddle it here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue()));
}
@@ -2662,7 +2666,7 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The operand is actually an imm0_4095, but we have its
// negation in the assembly source, so twiddle it here.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue()));
}
@@ -2671,9 +2675,7 @@ public:
Inst.addOperand(MCOperand::createImm(CE->getValue() >> 2));
return;
}
-
- const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
- assert(SR && "Unknown value type!");
+ const MCSymbolRefExpr *SR = cast<MCSymbolRefExpr>(Imm.Val);
Inst.addOperand(MCOperand::createExpr(SR));
}
@@ -2685,10 +2687,7 @@ public:
Inst.addOperand(MCOperand::createImm(CE->getValue()));
return;
}
-
- const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
-
- assert(SR && "Unknown value type!");
+ const MCSymbolRefExpr *SR = cast<MCSymbolRefExpr>(Imm.Val);
Inst.addOperand(MCOperand::createExpr(SR));
return;
}
@@ -2750,7 +2749,7 @@ public:
return;
}
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
int Val = CE->getValue();
Inst.addOperand(MCOperand::createImm(Val));
}
@@ -3130,7 +3129,7 @@ public:
void addPowerTwoOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue()));
}
@@ -3225,14 +3224,14 @@ public:
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
// Mask in that this is an i8 splat.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() | 0xe00));
}
void addNEONi16splatOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = CE->getValue();
Value = ARM_AM::encodeNEONi16splat(Value);
Inst.addOperand(MCOperand::createImm(Value));
@@ -3241,7 +3240,7 @@ public:
void addNEONi16splatNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = CE->getValue();
Value = ARM_AM::encodeNEONi16splat(~Value & 0xffff);
Inst.addOperand(MCOperand::createImm(Value));
@@ -3250,7 +3249,7 @@ public:
void addNEONi32splatOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = CE->getValue();
Value = ARM_AM::encodeNEONi32splat(Value);
Inst.addOperand(MCOperand::createImm(Value));
@@ -3259,7 +3258,7 @@ public:
void addNEONi32splatNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = CE->getValue();
Value = ARM_AM::encodeNEONi32splat(~Value);
Inst.addOperand(MCOperand::createImm(Value));
@@ -3267,7 +3266,7 @@ public:
void addNEONi8ReplicateOperands(MCInst &Inst, bool Inv) const {
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
Inst.getOpcode() == ARM::VMOVv16i8) &&
"All instructions that wants to replicate non-zero byte "
@@ -3298,7 +3297,7 @@ public:
void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = encodeNeonVMOVImmediate(CE->getValue());
Inst.addOperand(MCOperand::createImm(Value));
}
@@ -3310,7 +3309,7 @@ public:
void addNEONvmovi16ReplicateOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
assert((Inst.getOpcode() == ARM::VMOVv4i16 ||
Inst.getOpcode() == ARM::VMOVv8i16 ||
Inst.getOpcode() == ARM::VMVNv4i16 ||
@@ -3327,14 +3326,14 @@ public:
void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
unsigned Value = encodeNeonVMOVImmediate(~CE->getValue());
Inst.addOperand(MCOperand::createImm(Value));
}
void addNEONvmovi32ReplicateOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
assert((Inst.getOpcode() == ARM::VMOVv2i32 ||
Inst.getOpcode() == ARM::VMOVv4i32 ||
Inst.getOpcode() == ARM::VMVNv2i32 ||
@@ -3349,7 +3348,7 @@ public:
void addNEONi64splatOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// The immediate encodes the type of constant as well as the value.
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
uint64_t Value = CE->getValue();
unsigned Imm = 0;
for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
@@ -3360,20 +3359,28 @@ public:
void addComplexRotationEvenOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm(CE->getValue() / 90));
}
void addComplexRotationOddOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
Inst.addOperand(MCOperand::createImm((CE->getValue() - 90) / 180));
}
+ void addMveSaturateOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+ unsigned Imm = CE->getValue();
+ assert((Imm == 48 || Imm == 64) && "Invalid saturate operand");
+ Inst.addOperand(MCOperand::createImm(Imm == 48 ? 1 : 0));
+ }
+
void print(raw_ostream &OS) const override;
static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_ITCondMask);
+ auto Op = std::make_unique<ARMOperand>(k_ITCondMask);
Op->ITMask.Mask = Mask;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3382,7 +3389,7 @@ public:
static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC,
SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_CondCode);
+ auto Op = std::make_unique<ARMOperand>(k_CondCode);
Op->CC.Val = CC;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3391,7 +3398,7 @@ public:
static std::unique_ptr<ARMOperand> CreateVPTPred(ARMVCC::VPTCodes CC,
SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_VPTPred);
+ auto Op = std::make_unique<ARMOperand>(k_VPTPred);
Op->VCC.Val = CC;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3399,7 +3406,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_CoprocNum);
+ auto Op = std::make_unique<ARMOperand>(k_CoprocNum);
Op->Cop.Val = CopVal;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3407,7 +3414,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_CoprocReg);
+ auto Op = std::make_unique<ARMOperand>(k_CoprocReg);
Op->Cop.Val = CopVal;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3416,7 +3423,7 @@ public:
static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_CoprocOption);
+ auto Op = std::make_unique<ARMOperand>(k_CoprocOption);
Op->Cop.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3424,7 +3431,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_CCOut);
+ auto Op = std::make_unique<ARMOperand>(k_CCOut);
Op->Reg.RegNum = RegNum;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3432,7 +3439,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_Token);
+ auto Op = std::make_unique<ARMOperand>(k_Token);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -3442,7 +3449,7 @@ public:
static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_Register);
+ auto Op = std::make_unique<ARMOperand>(k_Register);
Op->Reg.RegNum = RegNum;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3453,7 +3460,7 @@ public:
CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
unsigned ShiftReg, unsigned ShiftImm, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ShiftedRegister);
+ auto Op = std::make_unique<ARMOperand>(k_ShiftedRegister);
Op->RegShiftedReg.ShiftTy = ShTy;
Op->RegShiftedReg.SrcReg = SrcReg;
Op->RegShiftedReg.ShiftReg = ShiftReg;
@@ -3466,7 +3473,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
unsigned ShiftImm, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ShiftedImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_ShiftedImmediate);
Op->RegShiftedImm.ShiftTy = ShTy;
Op->RegShiftedImm.SrcReg = SrcReg;
Op->RegShiftedImm.ShiftImm = ShiftImm;
@@ -3477,7 +3484,7 @@ public:
static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm,
SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ShifterImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_ShifterImmediate);
Op->ShifterImm.isASR = isASR;
Op->ShifterImm.Imm = Imm;
Op->StartLoc = S;
@@ -3487,7 +3494,7 @@ public:
static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_RotateImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_RotateImmediate);
Op->RotImm.Imm = Imm;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3496,7 +3503,7 @@ public:
static std::unique_ptr<ARMOperand> CreateModImm(unsigned Bits, unsigned Rot,
SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ModifiedImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_ModifiedImmediate);
Op->ModImm.Bits = Bits;
Op->ModImm.Rot = Rot;
Op->StartLoc = S;
@@ -3506,7 +3513,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateConstantPoolImm(const MCExpr *Val, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_ConstantPoolImmediate);
+ auto Op = std::make_unique<ARMOperand>(k_ConstantPoolImmediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3515,7 +3522,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor);
+ auto Op = std::make_unique<ARMOperand>(k_BitfieldDescriptor);
Op->Bitfield.LSB = LSB;
Op->Bitfield.Width = Width;
Op->StartLoc = S;
@@ -3543,16 +3550,15 @@ public:
Kind = k_SPRRegisterList;
}
- // Sort based on the register encoding values.
- array_pod_sort(Regs.begin(), Regs.end());
-
if (Kind == k_RegisterList && Regs.back().second == ARM::APSR)
Kind = k_RegisterListWithAPSR;
- auto Op = make_unique<ARMOperand>(Kind);
- for (SmallVectorImpl<std::pair<unsigned, unsigned>>::const_iterator
- I = Regs.begin(), E = Regs.end(); I != E; ++I)
- Op->Registers.push_back(I->second);
+ assert(std::is_sorted(Regs.begin(), Regs.end()) &&
+ "Register list must be sorted by encoding");
+
+ auto Op = std::make_unique<ARMOperand>(Kind);
+ for (const auto &P : Regs)
+ Op->Registers.push_back(P.second);
Op->StartLoc = StartLoc;
Op->EndLoc = EndLoc;
@@ -3563,7 +3569,7 @@ public:
unsigned Count,
bool isDoubleSpaced,
SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_VectorList);
+ auto Op = std::make_unique<ARMOperand>(k_VectorList);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.isDoubleSpaced = isDoubleSpaced;
@@ -3575,7 +3581,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced,
SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_VectorListAllLanes);
+ auto Op = std::make_unique<ARMOperand>(k_VectorListAllLanes);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.isDoubleSpaced = isDoubleSpaced;
@@ -3587,7 +3593,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index,
bool isDoubleSpaced, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_VectorListIndexed);
+ auto Op = std::make_unique<ARMOperand>(k_VectorListIndexed);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.LaneIndex = Index;
@@ -3599,7 +3605,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
- auto Op = make_unique<ARMOperand>(k_VectorIndex);
+ auto Op = std::make_unique<ARMOperand>(k_VectorIndex);
Op->VectorIndex.Val = Idx;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3608,7 +3614,7 @@ public:
static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S,
SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_Immediate);
+ auto Op = std::make_unique<ARMOperand>(k_Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -3620,7 +3626,7 @@ public:
unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType,
unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S,
SMLoc E, SMLoc AlignmentLoc = SMLoc()) {
- auto Op = make_unique<ARMOperand>(k_Memory);
+ auto Op = std::make_unique<ARMOperand>(k_Memory);
Op->Memory.BaseRegNum = BaseRegNum;
Op->Memory.OffsetImm = OffsetImm;
Op->Memory.OffsetRegNum = OffsetRegNum;
@@ -3637,7 +3643,7 @@ public:
static std::unique_ptr<ARMOperand>
CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy,
unsigned ShiftImm, SMLoc S, SMLoc E) {
- auto Op = make_unique<ARMOperand>(k_PostIndexRegister);
+ auto Op = std::make_unique<ARMOperand>(k_PostIndexRegister);
Op->PostIdxReg.RegNum = RegNum;
Op->PostIdxReg.isAdd = isAdd;
Op->PostIdxReg.ShiftTy = ShiftTy;
@@ -3649,7 +3655,7 @@ public:
static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt,
SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_MemBarrierOpt);
+ auto Op = std::make_unique<ARMOperand>(k_MemBarrierOpt);
Op->MBOpt.Val = Opt;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3658,7 +3664,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_InstSyncBarrierOpt);
+ auto Op = std::make_unique<ARMOperand>(k_InstSyncBarrierOpt);
Op->ISBOpt.Val = Opt;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3667,7 +3673,7 @@ public:
static std::unique_ptr<ARMOperand>
CreateTraceSyncBarrierOpt(ARM_TSB::TraceSyncBOpt Opt, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_TraceSyncBarrierOpt);
+ auto Op = std::make_unique<ARMOperand>(k_TraceSyncBarrierOpt);
Op->TSBOpt.Val = Opt;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3676,7 +3682,7 @@ public:
static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags,
SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_ProcIFlags);
+ auto Op = std::make_unique<ARMOperand>(k_ProcIFlags);
Op->IFlags.Val = IFlags;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3684,7 +3690,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_MSRMask);
+ auto Op = std::make_unique<ARMOperand>(k_MSRMask);
Op->MMask.Val = MMask;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -3692,7 +3698,7 @@ public:
}
static std::unique_ptr<ARMOperand> CreateBankedReg(unsigned Reg, SMLoc S) {
- auto Op = make_unique<ARMOperand>(k_BankedReg);
+ auto Op = std::make_unique<ARMOperand>(k_BankedReg);
Op->BankedReg.Val = Reg;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -4253,6 +4259,24 @@ static unsigned getNextRegister(unsigned Reg) {
}
}
+// Insert an <Encoding, Register> pair into an ordered vector. Return true on
+// success, or false if a duplicate encoding was found.
+static bool
+insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
+ unsigned Enc, unsigned Reg) {
+ Regs.emplace_back(Enc, Reg);
+ for (auto I = Regs.rbegin(), J = I + 1, E = Regs.rend(); J != E; ++I, ++J) {
+ if (J->first == Enc) {
+ Regs.erase(J.base());
+ return false;
+ }
+ if (J->first < Enc)
+ break;
+ std::swap(*I, *J);
+ }
+ return true;
+}
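The helper appends the new pair and then bubbles it towards the front until the vector is ordered by encoding again; if an equal encoding is already present it erases the new entry and returns false. A standalone sketch of the same idea over std::vector (names are hypothetical, not the parser code):

#include <utility>
#include <vector>

static bool insertSortedNoDup(std::vector<std::pair<unsigned, unsigned>> &Regs,
                              unsigned Enc, unsigned Reg) {
  Regs.emplace_back(Enc, Reg);
  for (auto I = Regs.rbegin(), J = I + 1, E = Regs.rend(); J != E; ++I, ++J) {
    if (J->first == Enc) {   // an equal encoding already exists:
      Regs.erase(J.base());  // drop the newly inserted entry and report it
      return false;
    }
    if (J->first < Enc)      // reached its sorted slot
      break;
    std::swap(*I, *J);       // bubble the new entry towards the front
  }
  return true;
}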
+
/// Parse a register list.
bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
bool EnforceOrder) {
@@ -4278,7 +4302,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
Reg = getDRegFromQReg(Reg);
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ Registers.emplace_back(EReg, Reg);
++Reg;
}
const MCRegisterClass *RC;
@@ -4295,7 +4319,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
// Store the register.
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ Registers.emplace_back(EReg, Reg);
// This starts immediately after the first register token in the list,
// so we can see either a comma or a minus (range separator) as a legal
@@ -4326,7 +4350,11 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
while (Reg != EndReg) {
Reg = getNextRegister(Reg);
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ if (!insertNoDuplicates(Registers, EReg, Reg)) {
+ Warning(AfterMinusLoc, StringRef("duplicated register (") +
+ ARMInstPrinter::getRegisterName(Reg) +
+ ") in register list");
+ }
}
continue;
}
@@ -4350,11 +4378,16 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
// subset of GPRRegClassId except it contains APSR as well.
RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID];
}
- if (Reg == ARM::VPR && (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] ||
- RC == &ARMMCRegisterClasses[ARM::DPRRegClassID])) {
+ if (Reg == ARM::VPR &&
+ (RC == &ARMMCRegisterClasses[ARM::SPRRegClassID] ||
+ RC == &ARMMCRegisterClasses[ARM::DPRRegClassID] ||
+ RC == &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID])) {
RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID];
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ if (!insertNoDuplicates(Registers, EReg, Reg)) {
+ Warning(RegLoc, "duplicated register (" + RegTok.getString() +
+ ") in register list");
+ }
continue;
}
// The register must be in the same register class as the first.
@@ -4371,21 +4404,19 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands,
else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg))
return Error(RegLoc, "register list not in ascending order");
}
- if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) {
- Warning(RegLoc, "duplicated register (" + RegTok.getString() +
- ") in register list");
- continue;
- }
// VFP register lists must also be contiguous.
if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] &&
Reg != OldReg + 1)
return Error(RegLoc, "non-contiguous register range");
EReg = MRI->getEncodingValue(Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ if (!insertNoDuplicates(Registers, EReg, Reg)) {
+ Warning(RegLoc, "duplicated register (" + RegTok.getString() +
+ ") in register list");
+ }
if (isQReg) {
EReg = MRI->getEncodingValue(++Reg);
- Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+ Registers.emplace_back(EReg, Reg);
}
}
@@ -5702,14 +5733,16 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) {
return false;
}
- // If we have a '#', it's an immediate offset, else assume it's a register
- // offset. Be friendly and also accept a plain integer (without a leading
- // hash) for gas compatibility.
+ // If we have a '#' or '$', it's an immediate offset, else assume it's a
+ // register offset. Be friendly and also accept a plain integer or expression
+ // (without a leading hash) for gas compatibility.
if (Parser.getTok().is(AsmToken::Hash) ||
Parser.getTok().is(AsmToken::Dollar) ||
+ Parser.getTok().is(AsmToken::LParen) ||
Parser.getTok().is(AsmToken::Integer)) {
- if (Parser.getTok().isNot(AsmToken::Integer))
- Parser.Lex(); // Eat '#' or '$'.
+ if (Parser.getTok().is(AsmToken::Hash) ||
+ Parser.getTok().is(AsmToken::Dollar))
+ Parser.Lex(); // Eat '#' or '$'
E = Parser.getTok().getLoc();
bool isNegative = getParser().getTok().is(AsmToken::Minus);
@@ -11308,7 +11341,7 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) {
SmallVector<uint8_t, 16> Opcodes;
auto parseOne = [&]() -> bool {
- const MCExpr *OE;
+ const MCExpr *OE = nullptr;
SMLoc OpcodeLoc = getLexer().getLoc();
if (check(getLexer().is(AsmToken::EndOfStatement) ||
Parser.parseExpression(OE),
@@ -11694,14 +11727,14 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
{ ARM::AEK_CRYPTO, {Feature_HasV8Bit},
{ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
{ ARM::AEK_FP, {Feature_HasV8Bit},
- {ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} },
+ {ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} },
{ (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM),
{Feature_HasV7Bit, Feature_IsNotMClassBit},
{ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} },
{ ARM::AEK_MP, {Feature_HasV7Bit, Feature_IsNotMClassBit},
{ARM::FeatureMP} },
{ ARM::AEK_SIMD, {Feature_HasV8Bit},
- {ARM::FeatureNEON, ARM::FeatureVFP2_D16_SP, ARM::FeatureFPARMv8} },
+ {ARM::FeatureNEON, ARM::FeatureVFP2_SP, ARM::FeatureFPARMv8} },
{ ARM::AEK_SEC, {Feature_HasV6KBit}, {ARM::FeatureTrustZone} },
// FIXME: Only available in A-class, isel not predicated
{ ARM::AEK_VIRT, {Feature_HasV7Bit}, {ARM::FeatureVirtualization} },
@@ -11775,19 +11808,19 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
// immediate in the syntax.
switch (Kind) {
default: break;
- case MCK__35_0:
+ case MCK__HASH_0:
if (Op.isImm())
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
if (CE->getValue() == 0)
return Match_Success;
break;
- case MCK__35_8:
+ case MCK__HASH_8:
if (Op.isImm())
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
if (CE->getValue() == 8)
return Match_Success;
break;
- case MCK__35_16:
+ case MCK__HASH_16:
if (Op.isImm())
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
if (CE->getValue() == 16)
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 673691ebd93e..eabc26d05f47 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -314,7 +314,7 @@ static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val,
+static DecodeStatus DecodeVMOVModImmInstruction(MCInst &Inst,unsigned Val,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMVEModImmInstruction(MCInst &Inst,unsigned Val,
uint64_t Address, const void *Decoder);
@@ -561,6 +561,8 @@ static DecodeStatus DecodeMVEVCMP(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
@@ -3445,7 +3447,7 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn,
}
static DecodeStatus
-DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn,
+DecodeVMOVModImmInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -5679,7 +5681,7 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
}
}
}
- return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+ return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder);
}
if (!(imm & 0x20)) return MCDisassembler::Fail;
@@ -5738,7 +5740,7 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
}
}
}
- return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+ return DecodeVMOVModImmInstruction(Inst, Insn, Address, Decoder);
}
if (!(imm & 0x20)) return MCDisassembler::Fail;
@@ -6481,6 +6483,12 @@ static DecodeStatus DecodeMVEOverlappingLongShift(
if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
return MCDisassembler::Fail;
+ if (fieldFromInstruction (Insn, 6, 3) != 4)
+ return MCDisassembler::SoftFail;
+
+ if (Rda == Rm)
+ return MCDisassembler::SoftFail;
+
return S;
}
@@ -6503,6 +6511,13 @@ static DecodeStatus DecodeMVEOverlappingLongShift(
if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
return MCDisassembler::Fail;
+ if (Inst.getOpcode() == ARM::MVE_SQRSHRL ||
+ Inst.getOpcode() == ARM::MVE_UQRSHLL) {
+ unsigned Saturate = fieldFromInstruction(Insn, 7, 1);
+ // The Saturate bit selects whether the shift saturates at 48 or 64 bits.
+ Inst.addOperand(MCOperand::createImm(Saturate));
+ }
+
return S;
}
@@ -6572,3 +6587,11 @@ static DecodeStatus DecodeMveVCTP(MCInst &Inst, unsigned Insn, uint64_t Address,
return MCDisassembler::Fail;
return S;
}
+
+static DecodeStatus DecodeMVEVPNOT(MCInst &Inst, unsigned Insn, uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ Inst.addOperand(MCOperand::createReg(ARM::VPR));
+ Inst.addOperand(MCOperand::createReg(ARM::VPR));
+ return S;
+}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index 7732a6485a85..24a9fabf0979 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -518,10 +518,10 @@ namespace ARM_AM {
// Valid alignments depend on the specific instruction.
//===--------------------------------------------------------------------===//
- // NEON Modified Immediates
+ // NEON/MVE Modified Immediates
//===--------------------------------------------------------------------===//
//
- // Several NEON instructions (e.g., VMOV) take a "modified immediate"
+ // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate"
// vector operand, where a small immediate encoded in the instruction
// specifies a full NEON vector value. These modified immediates are
// represented here as encoded integers. The low 8 bits hold the immediate
@@ -529,20 +529,20 @@ namespace ARM_AM {
// the "Cmode" field of the instruction. The interfaces below treat the
// Op and Cmode values as a single 5-bit value.
- inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) {
+ inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) {
return (OpCmode << 8) | Val;
}
- inline unsigned getNEONModImmOpCmode(unsigned ModImm) {
+ inline unsigned getVMOVModImmOpCmode(unsigned ModImm) {
return (ModImm >> 8) & 0x1f;
}
- inline unsigned getNEONModImmVal(unsigned ModImm) { return ModImm & 0xff; }
+ inline unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; }
- /// decodeNEONModImm - Decode a NEON modified immediate value into the
+ /// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the
/// element value and the element size in bits. (If the element size is
/// smaller than the vector, it is splatted into all the elements.)
- inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) {
- unsigned OpCmode = getNEONModImmOpCmode(ModImm);
- unsigned Imm8 = getNEONModImmVal(ModImm);
+ inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) {
+ unsigned OpCmode = getVMOVModImmOpCmode(ModImm);
+ unsigned Imm8 = getVMOVModImmVal(ModImm);
uint64_t Val = 0;
if (OpCmode == 0xe) {
@@ -572,7 +572,7 @@ namespace ARM_AM {
}
EltBits = 64;
} else {
- llvm_unreachable("Unsupported NEON immediate");
+ llvm_unreachable("Unsupported VMOV immediate");
}
return Val;
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index aeab5be78ab4..6196881a9b8f 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -233,7 +233,7 @@ static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) {
const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
uint64_t Value) const {
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
case ARM::fixup_arm_thumb_br: {
// Relaxing tB to t2B. tB has a signed 12-bit displacement with the
// low bit being an implied zero. There's an implied +4 offset for the
@@ -870,7 +870,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCValue &Target) {
const MCSymbolRefExpr *A = Target.getSymA();
const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
- const unsigned FixupKind = Fixup.getKind() ;
+ const unsigned FixupKind = Fixup.getKind();
if (FixupKind == FK_NONE)
return true;
if (FixupKind == ARM::fixup_arm_thumb_bl) {
@@ -1105,28 +1105,28 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
if (Instrs.empty())
return 0;
// Start off assuming CFA is at SP+0.
- int CFARegister = ARM::SP;
+ unsigned CFARegister = ARM::SP;
int CFARegisterOffset = 0;
// Mark savable registers as initially unsaved
DenseMap<unsigned, int> RegOffsets;
int FloatRegCount = 0;
// Process each .cfi directive and build up compact unwind info.
for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
- int Reg;
+ unsigned Reg;
const MCCFIInstruction &Inst = Instrs[i];
switch (Inst.getOperation()) {
case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa
CFARegisterOffset = -Inst.getOffset();
- CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true);
break;
case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset
CFARegisterOffset = -Inst.getOffset();
break;
case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register
- CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true);
break;
case MCCFIInstruction::OpOffset: // DW_CFA_offset
- Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
RegOffsets[Reg] = Inst.getOffset();
else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index c4daafe8ee97..6293a2462306 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -393,6 +393,9 @@ namespace ARMII {
// in an IT block).
ThumbArithFlagSetting = 1 << 19,
+ // Whether an instruction can be included in an MVE tail-predicated loop.
+ ValidForTailPredication = 1 << 20,
+
//===------------------------------------------------------------------===//
// Code domain.
DomainShift = 15,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index fda19eea1de6..1fee38821a49 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -82,7 +82,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
if (IsPCRel) {
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
@@ -145,7 +145,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
return ELF::R_ARM_THM_BF18;
}
}
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
@@ -263,5 +263,5 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createARMELFObjectWriter(uint8_t OSABI) {
- return llvm::make_unique<ARMELFObjectWriter>(OSABI);
+ return std::make_unique<ARMELFObjectWriter>(OSABI);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index 45be1ee96342..a1def61b58d9 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -1334,12 +1334,12 @@ void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
<< markup(">");
}
-void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+void ARMInstPrinter::printVMOVModImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned EncodedImm = MI->getOperand(OpNum).getImm();
unsigned EltBits;
- uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits);
+ uint64_t Val = ARM_AM::decodeVMOVModImm(EncodedImm, EltBits);
O << markup("<imm:") << "#0x";
O.write_hex(Val);
O << markup(">");
@@ -1676,3 +1676,11 @@ void ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
O.write_hex(Val);
O << markup(">");
}
+
+void ARMInstPrinter::printMveSaturateOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint32_t Val = MI->getOperand(OpNum).getImm();
+ assert(Val <= 1 && "Invalid MVE saturate operand");
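+  // The single-bit operand selects between the two encodable saturate
+  // amounts: 1 prints #48, 0 prints #64.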
+ O << "#" << (Val == 1 ? 48 : 64);
+}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
index 69026956b60e..eeb811e216fc 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
@@ -191,7 +191,7 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printFPImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+ void printVMOVModImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -262,7 +262,8 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
-
+ void printMveSaturateOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
private:
unsigned DefaultAltIdx = ARM::NoRegAltName;
};
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index dca6fe37d49a..268fe7efd9ce 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1720,7 +1720,6 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
unsigned Reg = MI.getOperand(Op).getReg();
bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg);
bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg);
- bool CLRMRegs = MI.getOpcode() == ARM::t2CLRM;
unsigned Binary = 0;
@@ -1739,21 +1738,13 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
Binary |= NumRegs * 2;
} else {
const MCRegisterInfo &MRI = *CTX.getRegisterInfo();
- if (!CLRMRegs) {
- assert(std::is_sorted(MI.begin() + Op, MI.end(),
- [&](const MCOperand &LHS, const MCOperand &RHS) {
- return MRI.getEncodingValue(LHS.getReg()) <
- MRI.getEncodingValue(RHS.getReg());
- }));
- }
-
+ assert(std::is_sorted(MI.begin() + Op, MI.end(),
+ [&](const MCOperand &LHS, const MCOperand &RHS) {
+ return MRI.getEncodingValue(LHS.getReg()) <
+ MRI.getEncodingValue(RHS.getReg());
+ }));
for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
- unsigned RegNo;
- if (CLRMRegs && MI.getOperand(I).getReg() == ARM::APSR) {
- RegNo = 15;
- } else {
- RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg());
- }
+ unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg());
Binary |= 1 << RegNo;
}
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index c49885023cb2..ed4000c7e5be 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -204,7 +204,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
// relocation entry in the low 16 bits of r_address field.
unsigned ThumbBit = 0;
unsigned MovtBit = 0;
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default: break;
case ARM::fixup_arm_movt_hi16:
MovtBit = 1;
@@ -480,7 +480,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
// PAIR. I.e. it's correct that we insert the high bits of the addend in the
// MOVW case here. relocation entries.
uint32_t Value = 0;
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default: break;
case ARM::fixup_arm_movw_lo16:
case ARM::fixup_t2_movw_lo16:
@@ -506,5 +506,5 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
std::unique_ptr<MCObjectTargetWriter>
llvm::createARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
uint32_t CPUSubtype) {
- return llvm::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
+ return std::make_unique<ARMMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index b863517c0cca..7b30a61e8ccb 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -249,12 +249,12 @@ void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
: ARM::FK_VFPV3_D16)
: (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16
: ARM::FK_VFPV3XD)));
- else if (STI.hasFeature(ARM::FeatureVFP2_D16_SP))
+ else if (STI.hasFeature(ARM::FeatureVFP2_SP))
emitFPU(ARM::FK_VFPV2);
}
// ABI_HardFP_use attribute to indicate single precision FP.
- if (STI.hasFeature(ARM::FeatureVFP2_D16_SP) && !STI.hasFeature(ARM::FeatureFP64))
+ if (STI.hasFeature(ARM::FeatureVFP2_SP) && !STI.hasFeature(ARM::FeatureFP64))
emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
ARMBuildAttrs::HardFPSinglePrecision);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 054a95dd1e12..900c5fe30364 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -92,7 +92,7 @@ namespace llvm {
std::unique_ptr<MCObjectTargetWriter>
createARMWinCOFFObjectWriter(bool Is64Bit) {
- return llvm::make_unique<ARMWinCOFFObjectWriter>(Is64Bit);
+ return std::make_unique<ARMWinCOFFObjectWriter>(Is64Bit);
}
} // end namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index 2e816bea5e91..b3c8146a9bde 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -22,20 +22,10 @@ public:
std::unique_ptr<MCObjectWriter> OW)
: MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
- void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitThumbFunc(MCSymbol *Symbol) override;
void FinishImpl() override;
};
-void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
- switch (Flag) {
- default: llvm_unreachable("not implemented");
- case MCAF_SyntaxUnified:
- case MCAF_Code16:
- break;
- }
-}
-
void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) {
getAssembler().setIsThumbFunc(Symbol);
}
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 4b25986b90a7..cc31929899b4 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -86,8 +86,8 @@ void MLxExpansion::pushStack(MachineInstr *MI) {
MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
// Look past COPY and INSERT_SUBREG instructions to find the
// real definition MI. This is important for _sfp instructions.
- unsigned Reg = MI->getOperand(1).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = MI->getOperand(1).getReg();
+ if (Register::isPhysicalRegister(Reg))
return nullptr;
MachineBasicBlock *MBB = MI->getParent();
@@ -97,13 +97,13 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
break;
if (DefMI->isCopyLike()) {
Reg = DefMI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
DefMI = MRI->getVRegDef(Reg);
continue;
}
} else if (DefMI->isInsertSubreg()) {
Reg = DefMI->getOperand(2).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
DefMI = MRI->getVRegDef(Reg);
continue;
}
@@ -114,9 +114,8 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
}
unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
- unsigned Reg = MI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
- !MRI->hasOneNonDBGUse(Reg))
+ Register Reg = MI->getOperand(0).getReg();
+ if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg))
return Reg;
MachineBasicBlock *MBB = MI->getParent();
@@ -126,8 +125,7 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
Reg = UseMI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
- !MRI->hasOneNonDBGUse(Reg))
+ if (Register::isPhysicalRegister(Reg) || !MRI->hasOneNonDBGUse(Reg))
return Reg;
UseMI = &*MRI->use_instr_nodbg_begin(Reg);
if (UseMI->getParent() != MBB)
@@ -140,8 +138,8 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
/// hasLoopHazard - Check whether an MLx instruction is chained to itself across
/// a single-MBB loop.
bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const {
- unsigned Reg = MI->getOperand(1).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = MI->getOperand(1).getReg();
+ if (Register::isPhysicalRegister(Reg))
return false;
MachineBasicBlock *MBB = MI->getParent();
@@ -154,8 +152,8 @@ outer_continue:
if (DefMI->isPHI()) {
for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
if (DefMI->getOperand(i + 1).getMBB() == MBB) {
- unsigned SrcReg = DefMI->getOperand(i).getReg();
- if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ Register SrcReg = DefMI->getOperand(i).getReg();
+ if (Register::isVirtualRegister(SrcReg)) {
DefMI = MRI->getVRegDef(SrcReg);
goto outer_continue;
}
@@ -163,13 +161,13 @@ outer_continue:
}
} else if (DefMI->isCopyLike()) {
Reg = DefMI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
DefMI = MRI->getVRegDef(Reg);
continue;
}
} else if (DefMI->isInsertSubreg()) {
Reg = DefMI->getOperand(2).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
DefMI = MRI->getVRegDef(Reg);
continue;
}
@@ -271,23 +269,23 @@ void
MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
unsigned MulOpc, unsigned AddSubOpc,
bool NegAcc, bool HasLane) {
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
bool DstDead = MI->getOperand(0).isDead();
- unsigned AccReg = MI->getOperand(1).getReg();
- unsigned Src1Reg = MI->getOperand(2).getReg();
- unsigned Src2Reg = MI->getOperand(3).getReg();
+ Register AccReg = MI->getOperand(1).getReg();
+ Register Src1Reg = MI->getOperand(2).getReg();
+ Register Src2Reg = MI->getOperand(3).getReg();
bool Src1Kill = MI->getOperand(2).isKill();
bool Src2Kill = MI->getOperand(3).isKill();
unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
unsigned NextOp = HasLane ? 5 : 4;
ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
- unsigned PredReg = MI->getOperand(++NextOp).getReg();
+ Register PredReg = MI->getOperand(++NextOp).getReg();
const MCInstrDesc &MCID1 = TII->get(MulOpc);
const MCInstrDesc &MCID2 = TII->get(AddSubOpc);
const MachineFunction &MF = *MI->getParent()->getParent();
- unsigned TmpReg = MRI->createVirtualRegister(
- TII->getRegClass(MCID1, 0, TRI, MF));
+ Register TmpReg =
+ MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI, MF));
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg)
.addReg(Src1Reg, getKillRegState(Src1Kill))
diff --git a/lib/Target/ARM/MVETailPredication.cpp b/lib/Target/ARM/MVETailPredication.cpp
new file mode 100644
index 000000000000..4db8ab17c49b
--- /dev/null
+++ b/lib/Target/ARM/MVETailPredication.cpp
@@ -0,0 +1,519 @@
+//===- MVETailPredication.cpp - MVE Tail Predication ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead
+/// branches to help accelerate DSP applications. These two extensions can be
+/// combined to provide implicit vector predication within a low-overhead loop.
+/// The HardwareLoops pass inserts intrinsics identifying loops that the
+/// backend will attempt to convert into a low-overhead loop. The vectorizer is
+/// responsible for generating a vectorized loop in which the lanes are
+/// predicated upon the iteration counter. This pass looks at these predicated
+/// vector loops, which are targets for low-overhead loops, and prepares them
+/// for code generation. Once the vectorizer has produced a masked loop, there
+/// are a couple of possible final forms (an illustrative sketch follows at the
+/// end of this comment):
+/// - A tail-predicated loop, with implicit predication.
+/// - A loop containing multiple VCTP instructions, predicating multiple VPT
+/// blocks of instructions operating on different vector types.
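+///
+/// As an illustrative sketch (the value names below are invented, not actual
+/// compiler output), a masked loop whose predicate is derived from the
+/// induction variable:
+///
+///   %pred = icmp ule <4 x i32> %induction, %broadcast.splat
+///   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr,
+///                                              i32 4, <4 x i1> %pred)
+///
+/// is rewritten so that the predicate instead comes from a VCTP intrinsic,
+/// driven by a phi counting the elements left to process:
+///
+///   %elems = phi i32 [ %num.elements, %preheader ], [ %remaining, %latch ]
+///   %pred = call <4 x i1> @llvm.arm.vctp32(i32 %elems)
+///   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr,
+///                                              i32 4, <4 x i1> %pred)
+///   %remaining = sub i32 %elems, 4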
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "ARM.h"
+#include "ARMSubtarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mve-tail-predication"
+#define DESC "Transform predicated vector loops to use MVE tail predication"
+
+static cl::opt<bool>
+DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
+ cl::init(true),
+ cl::desc("Disable MVE Tail Predication"));
+namespace {
+
+class MVETailPredication : public LoopPass {
+ SmallVector<IntrinsicInst*, 4> MaskedInsts;
+ Loop *L = nullptr;
+ ScalarEvolution *SE = nullptr;
+ TargetTransformInfo *TTI = nullptr;
+
+public:
+ static char ID;
+
+ MVETailPredication() : LoopPass(ID) { }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager&) override;
+
+private:
+
+ /// Perform the relevant checks on the loop and convert if possible.
+ bool TryConvert(Value *TripCount);
+
+  /// Return whether this is a vectorized loop that contains masked
+  /// load/stores.
+ bool IsPredicatedVectorLoop();
+
+ /// Compute a value for the total number of elements that the predicated
+ /// loop will process.
+ Value *ComputeElements(Value *TripCount, VectorType *VecTy);
+
+  /// Return whether Predicate is the icmp that generates an i1 vector, based
+  /// upon a loop counter and a limit that is defined outside the loop.
+ bool isTailPredicate(Instruction *Predicate, Value *NumElements);
+};
+
+} // end namespace
+
+static bool IsDecrement(Instruction &I) {
+ auto *Call = dyn_cast<IntrinsicInst>(&I);
+ if (!Call)
+ return false;
+
+ Intrinsic::ID ID = Call->getIntrinsicID();
+ return ID == Intrinsic::loop_decrement_reg;
+}
+
+static bool IsMasked(Instruction *I) {
+ auto *Call = dyn_cast<IntrinsicInst>(I);
+ if (!Call)
+ return false;
+
+ Intrinsic::ID ID = Call->getIntrinsicID();
+ // TODO: Support gather/scatter expand/compress operations.
+ return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
+}
+
+bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
+ if (skipLoop(L) || DisableTailPredication)
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<TargetMachine>();
+ auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ this->L = L;
+
+ // The MVE and LOB extensions are combined to enable tail-predication, but
+ // there's nothing preventing us from generating VCTP instructions for v8.1m.
+ if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) {
+ LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n");
+ return false;
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ auto FindLoopIterations = [](BasicBlock *BB) -> IntrinsicInst* {
+ for (auto &I : *BB) {
+ auto *Call = dyn_cast<IntrinsicInst>(&I);
+ if (!Call)
+ continue;
+
+ Intrinsic::ID ID = Call->getIntrinsicID();
+ if (ID == Intrinsic::set_loop_iterations ||
+ ID == Intrinsic::test_set_loop_iterations)
+ return cast<IntrinsicInst>(&I);
+ }
+ return nullptr;
+ };
+
+ // Look for the hardware loop intrinsic that sets the iteration count.
+ IntrinsicInst *Setup = FindLoopIterations(Preheader);
+
+  // The test.set iteration count intrinsic could live in the pre-preheader.
+ if (!Setup) {
+ if (!Preheader->getSinglePredecessor())
+ return false;
+ Setup = FindLoopIterations(Preheader->getSinglePredecessor());
+ if (!Setup)
+ return false;
+ }
+
+  // Search for the hardware loop intrinsic that decrements the loop counter.
+ IntrinsicInst *Decrement = nullptr;
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (IsDecrement(I)) {
+ Decrement = cast<IntrinsicInst>(&I);
+ break;
+ }
+ }
+ }
+
+ if (!Decrement)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L
+ << *Setup << "\n"
+ << *Decrement << "\n");
+ bool Changed = TryConvert(Setup->getArgOperand(0));
+ return Changed;
+}
+
+bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
+ // Look for the following:
+
+ // %trip.count.minus.1 = add i32 %N, -1
+ // %broadcast.splatinsert10 = insertelement <4 x i32> undef,
+ // i32 %trip.count.minus.1, i32 0
+ // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10,
+ // <4 x i32> undef,
+ // <4 x i32> zeroinitializer
+ // ...
+ // ...
+ // %index = phi i32
+ // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
+ // <4 x i32> undef,
+ // <4 x i32> zeroinitializer
+ // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+ // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
+
+  // And return whether I == %pred.
+
+ using namespace PatternMatch;
+
+ CmpInst::Predicate Pred;
+ Instruction *Shuffle = nullptr;
+ Instruction *Induction = nullptr;
+
+ // The vector icmp
+ if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
+ m_Instruction(Shuffle))) ||
+ Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle))
+ return false;
+
+  // First find the code outside the loop which sets up the limit vector:
+  // the invariant shuffle that broadcasts the limit into a vector.
+ Instruction *Insert = nullptr;
+ if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
+ m_Zero())))
+ return false;
+
+ // Insert the limit into a vector.
+ Instruction *BECount = nullptr;
+ if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount),
+ m_Zero())))
+ return false;
+
+ // The limit calculation, backedge count.
+ Value *TripCount = nullptr;
+ if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
+ return false;
+
+ if (TripCount != NumElements)
+ return false;
+
+ // Now back to searching inside the loop body...
+  // Find the add that takes the index iv and adds a constant vector to it.
+ Instruction *BroadcastSplat = nullptr;
+ Constant *Const = nullptr;
+ if (!match(Induction, m_Add(m_Instruction(BroadcastSplat),
+ m_Constant(Const))))
+ return false;
+
+ // Check that we're adding <0, 1, 2, 3...
+ if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) {
+ for (unsigned i = 0; i < CDS->getNumElements(); ++i) {
+ if (CDS->getElementAsInteger(i) != i)
+ return false;
+ }
+ } else
+ return false;
+
+ // The shuffle which broadcasts the index iv into a vector.
+ if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
+ m_Zero())))
+ return false;
+
+ // The insert element which initialises a vector with the index iv.
+ Instruction *IV = nullptr;
+ if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero())))
+ return false;
+
+ // The index iv.
+ auto *Phi = dyn_cast<PHINode>(IV);
+ if (!Phi)
+ return false;
+
+ // TODO: Don't think we need to check the entry value.
+ Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
+ if (!match(OnEntry, m_Zero()))
+ return false;
+
+ Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
+ unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements();
+
+ Instruction *LHS = nullptr;
+ if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
+ return false;
+
+ return LHS == Phi;
+}
+
+static VectorType* getVectorType(IntrinsicInst *I) {
+ unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
+ auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
+ return cast<VectorType>(PtrTy->getElementType());
+}
+
+bool MVETailPredication::IsPredicatedVectorLoop() {
+ // Check that the loop contains at least one masked load/store intrinsic.
+  // We only support 'normal' vector instructions; apart from masked
+  // load/stores, intrinsics operating on vectors are rejected.
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (IsMasked(&I)) {
+ VectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
+ unsigned Lanes = VecTy->getNumElements();
+ unsigned ElementWidth = VecTy->getScalarSizeInBits();
+ // MVE vectors are 128-bit, but don't support 128 x i1.
+ // TODO: Can we support vectors larger than 128-bits?
+ unsigned MaxWidth = TTI->getRegisterBitWidth(true);
+ if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth)
+ return false;
+ MaskedInsts.push_back(cast<IntrinsicInst>(&I));
+ } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
+ for (auto &U : Int->args()) {
+ if (isa<VectorType>(U->getType()))
+ return false;
+ }
+ }
+ }
+ }
+
+ return !MaskedInsts.empty();
+}
+
+Value* MVETailPredication::ComputeElements(Value *TripCount,
+ VectorType *VecTy) {
+ const SCEV *TripCountSE = SE->getSCEV(TripCount);
+ ConstantInt *VF = ConstantInt::get(cast<IntegerType>(TripCount->getType()),
+ VecTy->getNumElements());
+
+ if (VF->equalsInt(1))
+ return nullptr;
+
+ // TODO: Support constant trip counts.
+ auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* {
+ if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
+ if (Const->getAPInt() != -VF->getValue())
+ return nullptr;
+ } else
+ return nullptr;
+ return dyn_cast<SCEVMulExpr>(S->getOperand(1));
+ };
+
+ auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* {
+ if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
+ if (Const->getValue() != VF)
+ return nullptr;
+ } else
+ return nullptr;
+ return dyn_cast<SCEVUDivExpr>(S->getOperand(1));
+ };
+
+ auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* {
+ if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) {
+ if (Const->getValue() != VF)
+ return nullptr;
+ } else
+ return nullptr;
+
+ if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
+ if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
+ if (Const->getAPInt() != (VF->getValue() - 1))
+ return nullptr;
+ } else
+ return nullptr;
+
+ return RoundUp->getOperand(1);
+ }
+ return nullptr;
+ };
+
+ // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to
+  // determine the number of elements instead? Looks like this is what is used
+ // for delinearization, but I'm not sure if it can be applied to the
+ // vectorized form - at least not without a bit more work than I feel
+ // comfortable with.
+
+ // Search for Elems in the following SCEV:
+ // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw>
+ const SCEV *Elems = nullptr;
+ if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
+ if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
+ if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS()))
+ if (auto *Mul = VisitAdd(Add))
+ if (auto *Div = VisitMul(Mul))
+ if (auto *Res = VisitDiv(Div))
+ Elems = Res;
+
+ if (!Elems)
+ return nullptr;
+
+ Instruction *InsertPt = L->getLoopPreheader()->getTerminator();
+ if (!isSafeToExpandAt(Elems, InsertPt, *SE))
+ return nullptr;
+
+ auto DL = L->getHeader()->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "elements");
+ return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
+}
+
+// Look through the exit block to see whether there's a duplicate predicate
+// instruction. This can happen when we need to perform a select on values
+// from the last and previous iteration. Instead of doing a straight
+// replacement of that predicate with the vctp, clone the vctp and place it
+// in the block. This means that the VPR doesn't have to be live into the
+// exit block which should make it easier to convert this loop into a proper
+// exit block, which should make it easier to convert this loop into a proper
+static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
+ SetVector<Instruction*> &MaybeDead, Loop *L) {
+ if (BasicBlock *Exit = L->getUniqueExitBlock()) {
+ for (auto &Pair : NewPredicates) {
+ Instruction *OldPred = Pair.first;
+ Instruction *NewPred = Pair.second;
+
+ for (auto &I : *Exit) {
+ if (I.isSameOperationAs(OldPred)) {
+ Instruction *PredClone = NewPred->clone();
+ PredClone->insertBefore(&I);
+ I.replaceAllUsesWith(PredClone);
+ MaybeDead.insert(&I);
+ break;
+ }
+ }
+ }
+ }
+
+ // Drop references and add operands to check for dead.
+ SmallPtrSet<Instruction*, 4> Dead;
+ while (!MaybeDead.empty()) {
+ auto *I = MaybeDead.front();
+ MaybeDead.remove(I);
+ if (I->hasNUsesOrMore(1))
+ continue;
+
+ for (auto &U : I->operands()) {
+ if (auto *OpI = dyn_cast<Instruction>(U))
+ MaybeDead.insert(OpI);
+ }
+ I->dropAllReferences();
+ Dead.insert(I);
+ }
+
+ for (auto *I : Dead)
+ I->eraseFromParent();
+
+ for (auto I : L->blocks())
+ DeleteDeadPHIs(I);
+}
+
+bool MVETailPredication::TryConvert(Value *TripCount) {
+ if (!IsPredicatedVectorLoop())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n");
+
+ // Walk through the masked intrinsics and try to find whether the predicate
+ // operand is generated from an induction variable.
+ Module *M = L->getHeader()->getModule();
+ Type *Ty = IntegerType::get(M->getContext(), 32);
+ SetVector<Instruction*> Predicates;
+ DenseMap<Instruction*, Instruction*> NewPredicates;
+
+ for (auto *I : MaskedInsts) {
+ Intrinsic::ID ID = I->getIntrinsicID();
+ unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
+ auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
+ if (!Predicate || Predicates.count(Predicate))
+ continue;
+
+ VectorType *VecTy = getVectorType(I);
+ Value *NumElements = ComputeElements(TripCount, VecTy);
+ if (!NumElements)
+ continue;
+
+ if (!isTailPredicate(Predicate, NumElements)) {
+ LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n");
+ Predicates.insert(Predicate);
+
+ // Insert a phi to count the number of elements processed by the loop.
+ IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+ PHINode *Processed = Builder.CreatePHI(Ty, 2);
+ Processed->addIncoming(NumElements, L->getLoopPreheader());
+
+ // Insert the intrinsic to represent the effect of tail predication.
+ Builder.SetInsertPoint(cast<Instruction>(Predicate));
+ ConstantInt *Factor =
+ ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+ Intrinsic::ID VCTPID;
+ switch (VecTy->getNumElements()) {
+ default:
+ llvm_unreachable("unexpected number of lanes");
+ case 2: VCTPID = Intrinsic::arm_vctp64; break;
+ case 4: VCTPID = Intrinsic::arm_vctp32; break;
+ case 8: VCTPID = Intrinsic::arm_vctp16; break;
+ case 16: VCTPID = Intrinsic::arm_vctp8; break;
+ }
+ Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
+ Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
+ Predicate->replaceAllUsesWith(TailPredicate);
+ NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
+
+ // Add the incoming value to the new phi.
+ // TODO: This add likely already exists in the loop.
+ Value *Remaining = Builder.CreateSub(Processed, Factor);
+ Processed->addIncoming(Remaining, L->getLoopLatch());
+ LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: "
+ << *Processed << "\n"
+ << "TP: Inserted VCTP: " << *TailPredicate << "\n");
+ }
+
+ // Now clean up.
+ Cleanup(NewPredicates, Predicates, L);
+ return true;
+}
+
+Pass *llvm::createMVETailPredicationPass() {
+ return new MVETailPredication();
+}
+
+char MVETailPredication::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false)
+INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false)
diff --git a/lib/Target/ARM/MVEVPTBlockPass.cpp b/lib/Target/ARM/MVEVPTBlockPass.cpp
new file mode 100644
index 000000000000..bc0a80b177ed
--- /dev/null
+++ b/lib/Target/ARM/MVEVPTBlockPass.cpp
@@ -0,0 +1,278 @@
+//===-- MVEVPTBlockPass.cpp - Insert MVE VPT blocks -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include <cassert>
+#include <new>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-mve-vpt"
+
+namespace {
+ class MVEVPTBlock : public MachineFunctionPass {
+ public:
+ static char ID;
+ const Thumb2InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ MVEVPTBlock() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override {
+ return "MVE VPT block insertion pass";
+ }
+
+ private:
+ bool InsertVPTBlocks(MachineBasicBlock &MBB);
+ };
+
+ char MVEVPTBlock::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false)
+
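+// Encodings of the 4-bit mask field carried by VPT/VPST for blocks of one to
+// four predicated instructions. The first instruction in a block is always
+// then-predicated, so every name starts with 'T'.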
+enum VPTMaskValue {
+ T = 8, // 0b1000
+ TT = 4, // 0b0100
+ TE = 12, // 0b1100
+ TTT = 2, // 0b0010
+ TTE = 6, // 0b0110
+ TEE = 10, // 0b1010
+ TET = 14, // 0b1110
+ TTTT = 1, // 0b0001
+ TTTE = 3, // 0b0011
+ TTEE = 5, // 0b0101
+ TTET = 7, // 0b0111
+ TEEE = 9, // 0b1001
+ TEET = 11, // 0b1011
+ TETT = 13, // 0b1101
+ TETE = 15 // 0b1111
+};
+
+static unsigned VCMPOpcodeToVPT(unsigned Opcode) {
+ switch (Opcode) {
+ case ARM::MVE_VCMPf32:
+ return ARM::MVE_VPTv4f32;
+ case ARM::MVE_VCMPf16:
+ return ARM::MVE_VPTv8f16;
+ case ARM::MVE_VCMPi8:
+ return ARM::MVE_VPTv16i8;
+ case ARM::MVE_VCMPi16:
+ return ARM::MVE_VPTv8i16;
+ case ARM::MVE_VCMPi32:
+ return ARM::MVE_VPTv4i32;
+ case ARM::MVE_VCMPu8:
+ return ARM::MVE_VPTv16u8;
+ case ARM::MVE_VCMPu16:
+ return ARM::MVE_VPTv8u16;
+ case ARM::MVE_VCMPu32:
+ return ARM::MVE_VPTv4u32;
+ case ARM::MVE_VCMPs8:
+ return ARM::MVE_VPTv16s8;
+ case ARM::MVE_VCMPs16:
+ return ARM::MVE_VPTv8s16;
+ case ARM::MVE_VCMPs32:
+ return ARM::MVE_VPTv4s32;
+
+ case ARM::MVE_VCMPf32r:
+ return ARM::MVE_VPTv4f32r;
+ case ARM::MVE_VCMPf16r:
+ return ARM::MVE_VPTv8f16r;
+ case ARM::MVE_VCMPi8r:
+ return ARM::MVE_VPTv16i8r;
+ case ARM::MVE_VCMPi16r:
+ return ARM::MVE_VPTv8i16r;
+ case ARM::MVE_VCMPi32r:
+ return ARM::MVE_VPTv4i32r;
+ case ARM::MVE_VCMPu8r:
+ return ARM::MVE_VPTv16u8r;
+ case ARM::MVE_VCMPu16r:
+ return ARM::MVE_VPTv8u16r;
+ case ARM::MVE_VCMPu32r:
+ return ARM::MVE_VPTv4u32r;
+ case ARM::MVE_VCMPs8r:
+ return ARM::MVE_VPTv16s8r;
+ case ARM::MVE_VCMPs16r:
+ return ARM::MVE_VPTv8s16r;
+ case ARM::MVE_VCMPs32r:
+ return ARM::MVE_VPTv4s32r;
+
+ default:
+ return 0;
+ }
+}
+
+static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI,
+ const TargetRegisterInfo *TRI,
+ unsigned &NewOpcode) {
+  // Search backwards to the instruction that defines VPR. This may or may not
+  // be a VCMP; we check that after this loop. If we instead find an
+  // instruction that reads VPR, we return nullptr.
+ MachineBasicBlock::iterator CmpMI = MI;
+ while (CmpMI != MI->getParent()->begin()) {
+ --CmpMI;
+ if (CmpMI->modifiesRegister(ARM::VPR, TRI))
+ break;
+ if (CmpMI->readsRegister(ARM::VPR, TRI))
+ break;
+ }
+
+ if (CmpMI == MI)
+ return nullptr;
+ NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode());
+ if (NewOpcode == 0)
+ return nullptr;
+
+ // Search forward from CmpMI to MI, checking if either register was def'd
+ if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI),
+ MI, TRI))
+ return nullptr;
+ if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI),
+ MI, TRI))
+ return nullptr;
+ return &*CmpMI;
+}
+
+bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
+ bool Modified = false;
+ MachineBasicBlock::instr_iterator MBIter = Block.instr_begin();
+ MachineBasicBlock::instr_iterator EndIter = Block.instr_end();
+
+ while (MBIter != EndIter) {
+ MachineInstr *MI = &*MBIter;
+ unsigned PredReg = 0;
+ DebugLoc dl = MI->getDebugLoc();
+
+ ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
+
+ // The idea of the predicate is that None, Then and Else are for use when
+ // handling assembly language: they correspond to the three possible
+ // suffixes "", "t" and "e" on the mnemonic. So when instructions are read
+ // from assembly source or disassembled from object code, you expect to see
+ // a mixture whenever there's a long VPT block. But in code generation, we
+ // hope we'll never generate an Else as input to this pass.
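+    // As an illustrative example (not output of this pass), a two-instruction
+    // then-predicated block would be written in assembly with "t" suffixes:
+    //   vptt.i32 eq, q0, q1
+    //   vaddt.i32 q2, q2, q3
+    //   vsubt.i32 q4, q4, q5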
+ assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds");
+
+ if (Pred == ARMVCC::None) {
+ ++MBIter;
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump());
+ int VPTInstCnt = 1;
+ ARMVCC::VPTCodes NextPred;
+
+ // Look at subsequent instructions, checking if they can be in the same VPT
+ // block.
+ ++MBIter;
+ while (MBIter != EndIter && VPTInstCnt < 4) {
+ NextPred = getVPTInstrPredicate(*MBIter, PredReg);
+ assert(NextPred != ARMVCC::Else &&
+ "VPT block pass does not expect Else preds");
+ if (NextPred != Pred)
+ break;
+ LLVM_DEBUG(dbgs() << " adding : "; MBIter->dump());
+ ++VPTInstCnt;
+ ++MBIter;
+ };
+
+ unsigned BlockMask = 0;
+ switch (VPTInstCnt) {
+ case 1:
+ BlockMask = VPTMaskValue::T;
+ break;
+ case 2:
+ BlockMask = VPTMaskValue::TT;
+ break;
+ case 3:
+ BlockMask = VPTMaskValue::TTT;
+ break;
+ case 4:
+ BlockMask = VPTMaskValue::TTTT;
+ break;
+ default:
+      llvm_unreachable("Unexpected number of instructions in a VPT block");
+ };
+
+ // Search back for a VCMP that can be folded to create a VPT, or else create
+ // a VPST directly
+ MachineInstrBuilder MIBuilder;
+ unsigned NewOpcode;
+ MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode);
+ if (VCMP) {
+ LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump());
+ MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode));
+ MIBuilder.addImm(BlockMask);
+ MIBuilder.add(VCMP->getOperand(1));
+ MIBuilder.add(VCMP->getOperand(2));
+ MIBuilder.add(VCMP->getOperand(3));
+ VCMP->eraseFromParent();
+ } else {
+ MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST));
+ MIBuilder.addImm(BlockMask);
+ }
+
+ finalizeBundle(
+ Block, MachineBasicBlock::instr_iterator(MIBuilder.getInstr()), MBIter);
+
+ Modified = true;
+ }
+ return Modified;
+}
+
+bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
+ const ARMSubtarget &STI =
+ static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+
+ if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
+ return false;
+
+ TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
+ TRI = STI.getRegisterInfo();
+
+ LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n"
+ << "********** Function: " << Fn.getName() << '\n');
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : Fn)
+ Modified |= InsertVPTBlocks(MBB);
+
+ LLVM_DEBUG(dbgs() << "**************************************\n");
+ return Modified;
+}
+
+/// createMVEVPTBlock - Returns an instance of the MVE VPT block
+/// insertion pass.
+FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); }
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 426e9a0ed9b8..956d474f1d79 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -164,7 +164,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
// to determine the end of the prologue.
DebugLoc dl;
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
unsigned BasePtr = RegInfo->getBaseRegister();
int CFAOffset = 0;
@@ -459,8 +459,8 @@ static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) {
else if (MI.getOpcode() == ARM::tPOP) {
return true;
} else if (MI.getOpcode() == ARM::tMOVr) {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
return ((ARM::tGPRRegClass.contains(Src) || Src == ARM::LR) &&
ARM::hGPRRegClass.contains(Dst));
}
@@ -483,7 +483,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
assert((unsigned)NumBytes >= ArgRegsSaveSize &&
"ArgRegsSaveSize is included in NumBytes");
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ Register FramePtr = RegInfo->getFrameRegister(MF);
if (!AFI->hasStackFrame()) {
if (NumBytes - ArgRegsSaveSize != 0)
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index f57d93a2e83d..fccaa4c9cc8a 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -80,12 +80,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
assert((RC == &ARM::tGPRRegClass ||
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
- isARMLowRegister(SrcReg))) && "Unknown regclass!");
+ (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) &&
+ "Unknown regclass!");
if (RC == &ARM::tGPRRegClass ||
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
- isARMLowRegister(SrcReg))) {
+ (Register::isPhysicalRegister(SrcReg) && isARMLowRegister(SrcReg))) {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
@@ -108,13 +107,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
- (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
- isARMLowRegister(DestReg))) && "Unknown regclass!");
+ assert(
+ (RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
+ (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) &&
+ "Unknown regclass!");
if (RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
- (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
- isARMLowRegister(DestReg))) {
+ (Register::isPhysicalRegister(DestReg) && isARMLowRegister(DestReg))) {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 3143eb9840ed..786fc78d0233 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -87,7 +87,7 @@ static void TrackDefUses(MachineInstr *MI, RegisterSet &Defs, RegisterSet &Uses,
for (auto &MO : MI->operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP)
continue;
if (MO.isUse())
@@ -145,8 +145,8 @@ Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI,
MI->getOperand(1).getSubReg() == 0 &&
"Sub-register indices still around?");
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(1).getReg();
// First check if it's safe to move it.
if (Uses.count(DstReg) || Defs.count(SrcReg))
@@ -308,131 +308,3 @@ bool Thumb2ITBlock::runOnMachineFunction(MachineFunction &Fn) {
/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks
/// insertion pass.
FunctionPass *llvm::createThumb2ITBlockPass() { return new Thumb2ITBlock(); }
-
-#undef DEBUG_TYPE
-#define DEBUG_TYPE "arm-mve-vpt"
-
-namespace {
- class MVEVPTBlock : public MachineFunctionPass {
- public:
- static char ID;
- const Thumb2InstrInfo *TII;
- const TargetRegisterInfo *TRI;
-
- MVEVPTBlock() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &Fn) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
-
- StringRef getPassName() const override {
- return "MVE VPT block insertion pass";
- }
-
- private:
- bool InsertVPTBlocks(MachineBasicBlock &MBB);
- };
-
- char MVEVPTBlock::ID = 0;
-
-} // end anonymous namespace
-
-INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false)
-
-enum VPTMaskValue {
- T = 8, // 0b1000
- TT = 4, // 0b0100
- TE = 12, // 0b1100
- TTT = 2, // 0b0010
- TTE = 6, // 0b0110
- TEE = 10, // 0b1010
- TET = 14, // 0b1110
- TTTT = 1, // 0b0001
- TTTE = 3, // 0b0011
- TTEE = 5, // 0b0101
- TTET = 7, // 0b0111
- TEEE = 9, // 0b1001
- TEET = 11, // 0b1011
- TETT = 13, // 0b1101
- TETE = 15 // 0b1111
-};
-
-bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
- bool Modified = false;
- MachineBasicBlock::iterator MBIter = Block.begin();
- MachineBasicBlock::iterator EndIter = Block.end();
-
- while (MBIter != EndIter) {
- MachineInstr *MI = &*MBIter;
- unsigned PredReg = 0;
- DebugLoc dl = MI->getDebugLoc();
-
- ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
-
- // The idea of the predicate is that None, Then and Else are for use when
- // handling assembly language: they correspond to the three possible
- // suffixes "", "t" and "e" on the mnemonic. So when instructions are read
- // from assembly source or disassembled from object code, you expect to see
- // a mixture whenever there's a long VPT block. But in code generation, we
- // hope we'll never generate an Else as input to this pass.
-
- assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds");
-
- if (Pred == ARMVCC::None) {
- ++MBIter;
- continue;
- }
-
- MachineInstrBuilder MIBuilder =
- BuildMI(Block, MBIter, dl, TII->get(ARM::MVE_VPST));
- // The mask value for the VPST instruction is T = 0b1000 = 8
- MIBuilder.addImm(VPTMaskValue::T);
-
- MachineBasicBlock::iterator VPSTInsertPos = MIBuilder.getInstr();
- int VPTInstCnt = 1;
- ARMVCC::VPTCodes NextPred;
-
- do {
- ++MBIter;
- NextPred = getVPTInstrPredicate(*MBIter, PredReg);
- } while (NextPred != ARMVCC::None && NextPred == Pred && ++VPTInstCnt < 4);
-
- MachineInstr *LastMI = &*MBIter;
- finalizeBundle(Block, VPSTInsertPos.getInstrIterator(),
- ++LastMI->getIterator());
-
- Modified = true;
- LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump(););
-
- ++MBIter;
- }
- return Modified;
-}
-
-bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
- const ARMSubtarget &STI =
- static_cast<const ARMSubtarget &>(Fn.getSubtarget());
-
- if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
- return false;
-
- TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
- TRI = STI.getRegisterInfo();
-
- LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n"
- << "********** Function: " << Fn.getName() << '\n');
-
- bool Modified = false;
- for (MachineBasicBlock &MBB : Fn)
- Modified |= InsertVPTBlocks(MBB);
-
- LLVM_DEBUG(dbgs() << "**************************************\n");
- return Modified;
-}
-
-/// createMVEVPTBlock - Returns an instance of the MVE VPT block
-/// insertion pass.
-FunctionPass *llvm::createMVEVPTBlockPass() { return new MVEVPTBlock(); }
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 5a965f7a6b9b..af1f0aeb27ba 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -159,9 +159,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for
// gsub_0, but needs an extra constraint for gsub_1 (which could be sp
// otherwise).
- if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (Register::isVirtualRegister(SrcReg)) {
MachineRegisterInfo *MRI = &MF.getRegInfo();
- MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass);
+ MRI->constrainRegClass(SrcReg, &ARM::GPRPairnospRegClass);
}
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
@@ -200,10 +200,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for
// gsub_0, but needs an extra constraint for gsub_1 (which could be sp
// otherwise).
- if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ if (Register::isVirtualRegister(DestReg)) {
MachineRegisterInfo *MRI = &MF.getRegInfo();
- MRI->constrainRegClass(DestReg,
- &ARM::GPRPair_with_gsub_1_in_GPRwithAPSRnospRegClass);
+ MRI->constrainRegClass(DestReg, &ARM::GPRPairnospRegClass);
}
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
@@ -211,7 +210,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL));
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
return;
}
@@ -470,12 +469,17 @@ immediateOffsetOpcode(unsigned opcode)
bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
- const ARMBaseInstrInfo &TII) {
+ const ARMBaseInstrInfo &TII,
+ const TargetRegisterInfo *TRI) {
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MI.getDesc();
unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
bool isSub = false;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetRegisterClass *RegClass =
+ TII.getRegClass(Desc, FrameRegIdx, TRI, MF);
+
// Memory operands in inline assembly always use AddrModeT2_i12.
if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR)
AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
@@ -554,7 +558,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
// register then we change to an immediate version.
unsigned NewOpc = Opcode;
if (AddrMode == ARMII::AddrModeT2_so) {
- unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
+ Register OffsetReg = MI.getOperand(FrameRegIdx + 1).getReg();
if (OffsetReg != 0) {
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
return Offset == 0;
@@ -645,10 +649,21 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
// Attempt to fold address computation
- // Common case: small offset, fits into instruction.
+ // Common case: small offset, fits into instruction. We need to make sure
+ // the register class is correct too, for instructions like the MVE
+ // VLDRH.32, which only accepts low tGPR registers.
int ImmedOffset = Offset / Scale;
unsigned Mask = (1 << NumBits) - 1;
- if ((unsigned)Offset <= Mask * Scale) {
+ if ((unsigned)Offset <= Mask * Scale &&
+ (Register::isVirtualRegister(FrameReg) ||
+ RegClass->contains(FrameReg))) {
+ if (Register::isVirtualRegister(FrameReg)) {
+ // Make sure the register class for the virtual register is correct
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ if (!MRI->constrainRegClass(FrameReg, RegClass))
+ llvm_unreachable("Unable to constrain virtual register class.");
+ }
+
// Replace the FrameIndex with fp/sp
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
if (isSub) {
@@ -681,7 +696,8 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
}
Offset = (isSub) ? -Offset : Offset;
- return Offset == 0;
+ return Offset == 0 && (Register::isVirtualRegister(FrameReg) ||
+ RegClass->contains(FrameReg));
}
ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 37a85fa38417..c5a62aa33990 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -300,7 +300,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
for (const MachineOperand &MO : CPSRDef->operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isUse())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0 || Reg == ARM::CPSR)
continue;
Defs.insert(Reg);
@@ -309,7 +309,7 @@ Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
for (const MachineOperand &MO : Use->operands()) {
if (!MO.isReg() || MO.isUndef() || MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Defs.count(Reg))
return false;
}
@@ -380,7 +380,7 @@ static bool VerifyLowRegs(MachineInstr *MI) {
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || MO.isImplicit())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0 || Reg == ARM::CPSR)
continue;
if (isPCOk && Reg == ARM::PC)
@@ -464,11 +464,11 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
// For this reason we can't reuse the logic at the end of this function; we
// have to implement the MI building here.
bool IsStore = Entry.WideOpc == ARM::t2STR_POST;
- unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
- unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
+ Register Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
+ Register Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
unsigned Offset = MI->getOperand(3).getImm();
unsigned PredImm = MI->getOperand(4).getImm();
- unsigned PredReg = MI->getOperand(5).getReg();
+ Register PredReg = MI->getOperand(5).getReg();
assert(isARMLowRegister(Rt));
assert(isARMLowRegister(Rn));
@@ -496,7 +496,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
return true;
}
case ARM::t2LDMIA: {
- unsigned BaseReg = MI->getOperand(0).getReg();
+ Register BaseReg = MI->getOperand(0).getReg();
assert(isARMLowRegister(BaseReg));
// For the non-writeback version (this one), the base register must be
@@ -524,7 +524,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
break;
case ARM::t2LDMIA_RET: {
- unsigned BaseReg = MI->getOperand(1).getReg();
+ Register BaseReg = MI->getOperand(1).getReg();
if (BaseReg != ARM::SP)
return false;
Opc = Entry.NarrowOpc2; // tPOP_RET
@@ -537,7 +537,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
case ARM::t2STMDB_UPD: {
OpNum = 0;
- unsigned BaseReg = MI->getOperand(1).getReg();
+ Register BaseReg = MI->getOperand(1).getReg();
if (BaseReg == ARM::SP &&
(Entry.WideOpc == ARM::t2LDMIA_UPD ||
Entry.WideOpc == ARM::t2STMDB_UPD)) {
@@ -743,11 +743,11 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// are optimizing for size.
return false;
- unsigned Reg0 = MI->getOperand(0).getReg();
- unsigned Reg1 = MI->getOperand(1).getReg();
+ Register Reg0 = MI->getOperand(0).getReg();
+ Register Reg1 = MI->getOperand(1).getReg();
// t2MUL is "special". The tied source operand is second, not first.
if (MI->getOpcode() == ARM::t2MUL) {
- unsigned Reg2 = MI->getOperand(2).getReg();
+ Register Reg2 = MI->getOperand(2).getReg();
// Early exit if the regs aren't all low regs.
if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
|| !isARMLowRegister(Reg2))
@@ -782,7 +782,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
if (Imm > Limit)
return false;
} else {
- unsigned Reg2 = MI->getOperand(2).getReg();
+ Register Reg2 = MI->getOperand(2).getReg();
if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
return false;
}
@@ -868,7 +868,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
continue;
const MachineOperand &MO = MI->getOperand(i);
if (MO.isReg()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg || Reg == ARM::CPSR)
continue;
if (Entry.LowRegs1 && !isARMLowRegister(Reg))
diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp
index a96417ffce4d..b0ba58d8dc4a 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -107,8 +107,9 @@ void ThumbRegisterInfo::emitLoadConstPool(
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
if (STI.isThumb1Only()) {
- assert((isARMLowRegister(DestReg) || isVirtualRegister(DestReg)) &&
- "Thumb1 does not have ldr to high register");
+ assert(
+ (isARMLowRegister(DestReg) || Register::isVirtualRegister(DestReg)) &&
+ "Thumb1 does not have ldr to high register");
return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
PredReg, MIFlags);
}
@@ -141,7 +142,7 @@ static void emitThumbRegPlusImmInReg(
unsigned LdReg = DestReg;
if (DestReg == ARM::SP)
assert(BaseReg == ARM::SP && "Unexpected!");
- if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
+ if (!isARMLowRegister(DestReg) && !Register::isVirtualRegister(DestReg))
LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
@@ -371,7 +372,7 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II,
if (Opcode == ARM::tADDframe) {
Offset += MI.getOperand(FrameRegIdx+1).getImm();
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
*this);
@@ -509,7 +510,7 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (MI.mayLoad()) {
// Use the destination register to materialize sp + offset.
- unsigned TmpReg = MI.getOperand(0).getReg();
+ Register TmpReg = MI.getOperand(0).getReg();
bool UseRR = false;
if (Opcode == ARM::tLDRspi) {
if (FrameReg == ARM::SP || STI.genExecuteOnly())
diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp
index 7586bd7b78fc..1db6b2236b4f 100644
--- a/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -97,7 +97,7 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
assert(RegOp.isReg() && "Operand must be a register when you're"
"using 'A'..'Z' operand extracodes.");
- unsigned Reg = RegOp.getReg();
+ Register Reg = RegOp.getReg();
unsigned ByteNumber = ExtraCode[0] - 'A';
diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index c45b2d0e39c1..83d0f6845332 100644
--- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -140,8 +140,8 @@ bool AVRExpandPseudo::
expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(2).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool DstIsKill = MI.getOperand(1).isKill();
bool SrcIsKill = MI.getOperand(2).isKill();
@@ -173,8 +173,8 @@ bool AVRExpandPseudo::
expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(2).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool DstIsKill = MI.getOperand(1).isKill();
bool SrcIsKill = MI.getOperand(2).isKill();
@@ -220,7 +220,7 @@ bool AVRExpandPseudo::
expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
unsigned DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(3).isDead();
@@ -874,7 +874,7 @@ unsigned AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) {
// Exclude all the registers being used by the instruction.
for (MachineOperand &MO : MI.operands()) {
if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ !Register::isVirtualRegister(MO.getReg()))
Candidates.reset(MO.getReg());
}
diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp
index 5e91bb8632c1..e6c48de5a782 100644
--- a/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/lib/Target/AVR/AVRFrameLowering.cpp
@@ -30,7 +30,8 @@
namespace llvm {
AVRFrameLowering::AVRFrameLowering()
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 1, -2) {}
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align::None(),
+ -2) {}
bool AVRFrameLowering::canSimplifyCallFramePseudos(
const MachineFunction &MF) const {
@@ -323,7 +324,7 @@ static void fixStackStores(MachineBasicBlock &MBB,
"Invalid register, should be SP!");
if (insertPushes) {
// Replace this instruction with a push.
- unsigned SrcReg = MI.getOperand(2).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
bool SrcIsKill = MI.getOperand(2).isKill();
// We can't use PUSHWRr here because when expanded the order of the new
diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp
index 5cb4441c4380..4c4f4faa0508 100644
--- a/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -251,7 +251,7 @@ bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
RegisterSDNode *RegNode =
cast<RegisterSDNode>(CopyFromRegOp->getOperand(1));
Reg = RegNode->getReg();
- CanHandleRegImmOpt &= (TargetRegisterInfo::isVirtualRegister(Reg) ||
+ CanHandleRegImmOpt &= (Register::isVirtualRegister(Reg) ||
AVR::PTRDISPREGSRegClass.contains(Reg));
} else {
CanHandleRegImmOpt = false;
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index b6ba5f22fafb..f12c59b7d8c3 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -236,7 +236,7 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
setLibcallName(RTLIB::SIN_F32, "sin");
setLibcallName(RTLIB::COS_F32, "cos");
- setMinFunctionAlignment(1);
+ setMinFunctionAlignment(Align(2));
setMinimumJumpTableEntries(UINT_MAX);
}
@@ -1517,11 +1517,11 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
unsigned ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass);
unsigned ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass);
- unsigned ShiftReg = RI.createVirtualRegister(RC);
- unsigned ShiftReg2 = RI.createVirtualRegister(RC);
- unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register ShiftReg = RI.createVirtualRegister(RC);
+ Register ShiftReg2 = RI.createVirtualRegister(RC);
+ Register ShiftAmtSrcReg = MI.getOperand(2).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
// BB:
// cpi N, 0
@@ -1568,7 +1568,7 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
if (I->getOpcode() == AVR::COPY) {
- unsigned SrcReg = I->getOperand(1).getReg();
+ Register SrcReg = I->getOperand(1).getReg();
return (SrcReg == AVR::R0 || SrcReg == AVR::R1);
}
@@ -1689,6 +1689,8 @@ AVRTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
// See http://www.nongnu.org/avr-libc/user-manual/inline_asm.html
switch (Constraint[0]) {
+ default:
+ break;
case 'a': // Simple upper registers
case 'b': // Base pointer registers pairs
case 'd': // Upper register
@@ -1715,9 +1717,7 @@ AVRTargetLowering::getConstraintType(StringRef Constraint) const {
case 'O': // Integer constant (Range: 8, 16, 24)
case 'P': // Integer constant (Range: 1)
case 'R': // Integer constant (Range: -6 to 5)
- return C_Other;
- default:
- break;
+ return C_Immediate;
}
}
@@ -2006,10 +2006,9 @@ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-unsigned AVRTargetLowering::getRegisterByName(const char *RegName,
- EVT VT,
- SelectionDAG &DAG) const {
- unsigned Reg;
+Register AVRTargetLowering::getRegisterByName(const char *RegName, EVT VT,
+ const MachineFunction &MF) const {
+ Register Reg;
if (VT == MVT::i8) {
Reg = StringSwitch<unsigned>(RegName)
diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h
index ed2d0835903c..6c722fa5414b 100644
--- a/lib/Target/AVR/AVRISelLowering.h
+++ b/lib/Target/AVR/AVRISelLowering.h
@@ -125,8 +125,8 @@ public:
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
bool shouldSplitFunctionArgumentsAsLittleEndian(const DataLayout &DL)
const override {
diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
index a6b36f80485d..8fce05c933bc 100644
--- a/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -158,7 +158,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// We need to materialize the offset via an add instruction.
unsigned Opcode;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
assert(DstReg != AVR::R29R28 && "Dest reg cannot be the frame pointer");
II++; // Skip over the FRMIDX (and now MOVW) instruction.
diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp
index a36c8b0f9649..25304280d002 100644
--- a/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/lib/Target/AVR/AVRTargetMachine.cpp
@@ -50,7 +50,7 @@ AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT,
getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
SubTarget(TT, getCPU(CPU), FS, *this) {
- this->TLOF = make_unique<AVRTargetObjectFile>();
+ this->TLOF = std::make_unique<AVRTargetObjectFile>();
initAsmInfo();
}
diff --git a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index aac5644711e2..af60bc4fdc90 100644
--- a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -199,22 +199,22 @@ public:
}
static std::unique_ptr<AVROperand> CreateToken(StringRef Str, SMLoc S) {
- return make_unique<AVROperand>(Str, S);
+ return std::make_unique<AVROperand>(Str, S);
}
static std::unique_ptr<AVROperand> CreateReg(unsigned RegNum, SMLoc S,
SMLoc E) {
- return make_unique<AVROperand>(RegNum, S, E);
+ return std::make_unique<AVROperand>(RegNum, S, E);
}
static std::unique_ptr<AVROperand> CreateImm(const MCExpr *Val, SMLoc S,
SMLoc E) {
- return make_unique<AVROperand>(Val, S, E);
+ return std::make_unique<AVROperand>(Val, S, E);
}
static std::unique_ptr<AVROperand>
CreateMemri(unsigned RegNum, const MCExpr *Val, SMLoc S, SMLoc E) {
- return make_unique<AVROperand>(RegNum, Val, S, E);
+ return std::make_unique<AVROperand>(RegNum, Val, S, E);
}
void makeToken(StringRef Token) {
diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index 6025e4b2437c..1c69fea5962d 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -152,7 +152,7 @@ unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx,
}
std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI) {
- return make_unique<AVRELFObjectWriter>(OSABI);
+ return std::make_unique<AVRELFObjectWriter>(OSABI);
}
} // end of namespace llvm
diff --git a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 75885fd058a7..ce1d2ecd9d26 100644
--- a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -194,7 +194,7 @@ public:
}
static std::unique_ptr<BPFOperand> createToken(StringRef Str, SMLoc S) {
- auto Op = make_unique<BPFOperand>(Token);
+ auto Op = std::make_unique<BPFOperand>(Token);
Op->Tok = Str;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -203,7 +203,7 @@ public:
static std::unique_ptr<BPFOperand> createReg(unsigned RegNo, SMLoc S,
SMLoc E) {
- auto Op = make_unique<BPFOperand>(Register);
+ auto Op = std::make_unique<BPFOperand>(Register);
Op->Reg.RegNum = RegNo;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -212,7 +212,7 @@ public:
static std::unique_ptr<BPFOperand> createImm(const MCExpr *Val, SMLoc S,
SMLoc E) {
- auto Op = make_unique<BPFOperand>(Immediate);
+ auto Op = std::make_unique<BPFOperand>(Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
diff --git a/lib/Target/BPF/BPF.h b/lib/Target/BPF/BPF.h
index d311fc154094..6e4f35f4c5d7 100644
--- a/lib/Target/BPF/BPF.h
+++ b/lib/Target/BPF/BPF.h
@@ -15,17 +15,19 @@
namespace llvm {
class BPFTargetMachine;
-ModulePass *createBPFAbstractMemberAccess();
+ModulePass *createBPFAbstractMemberAccess(BPFTargetMachine *TM);
FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
FunctionPass *createBPFMISimplifyPatchablePass();
FunctionPass *createBPFMIPeepholePass();
+FunctionPass *createBPFMIPeepholeTruncElimPass();
FunctionPass *createBPFMIPreEmitPeepholePass();
FunctionPass *createBPFMIPreEmitCheckingPass();
void initializeBPFAbstractMemberAccessPass(PassRegistry&);
void initializeBPFMISimplifyPatchablePass(PassRegistry&);
void initializeBPFMIPeepholePass(PassRegistry&);
+void initializeBPFMIPeepholeTruncElimPass(PassRegistry&);
void initializeBPFMIPreEmitPeepholePass(PassRegistry&);
void initializeBPFMIPreEmitCheckingPass(PassRegistry&);
}
diff --git a/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index 51d4cbc8a429..400701c4e5c2 100644
--- a/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -50,6 +50,28 @@
// addr = preserve_struct_access_index(base, gep_index, di_index)
// !llvm.preserve.access.index <struct_ditype>
//
+// Bitfield member access needs special attention. A user cannot take the
+// address of a bitfield access. To help the kernel verifier optimize
+// bitfield code, a new clang intrinsic is introduced:
+// uint32_t __builtin_preserve_field_info(member_access, info_kind)
+// In IR, a chain with two (or more) intrinsic calls will be generated:
+// ...
+// addr = preserve_struct_access_index(base, 1, 1) !struct s
+// uint32_t result = bpf_preserve_field_info(addr, info_kind)
+//
+// Suppose the info_kind is FIELD_SIGNEDNESS.
+// The above two IR intrinsics will be replaced with
+// a relocatable insn:
+//   signedness = /* signedness of member_access */
+// and the signedness can be changed by the bpf loader based on the
+// types on the target host.
+//
+// A user can also test whether a field exists or not with
+//   uint32_t result = bpf_preserve_field_info(member_access, FIELD_EXISTENCE)
+// The field will always be available (result = 1) during the initial
+// compilation, but the bpf loader can patch it with the correct value
+// on the target host, where member_access may or may not be available.
+//
//===----------------------------------------------------------------------===//
#include "BPF.h"
@@ -65,13 +87,12 @@
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <stack>
#define DEBUG_TYPE "bpf-abstract-member-access"
namespace llvm {
const std::string BPFCoreSharedInfo::AmaAttr = "btf_ama";
-const std::string BPFCoreSharedInfo::PatchableExtSecName =
- ".BPF.patchable_externs";
} // namespace llvm
using namespace llvm;
@@ -87,40 +108,62 @@ class BPFAbstractMemberAccess final : public ModulePass {
public:
static char ID;
- BPFAbstractMemberAccess() : ModulePass(ID) {}
+ TargetMachine *TM;
+ // Add an optional BPFTargetMachine parameter so that the BPF backend can
+ // construct the pass with a target machine and query its endianness. The
+ // default constructor (without parameters) is used by the pass manager for
+ // pass registration and management.
+ BPFAbstractMemberAccess(BPFTargetMachine *TM = nullptr) : ModulePass(ID), TM(TM) {}
+
+ struct CallInfo {
+ uint32_t Kind;
+ uint32_t AccessIndex;
+ MDNode *Metadata;
+ Value *Base;
+ };
+ typedef std::stack<std::pair<CallInst *, CallInfo>> CallInfoStack;
private:
enum : uint32_t {
BPFPreserveArrayAI = 1,
BPFPreserveUnionAI = 2,
BPFPreserveStructAI = 3,
+ BPFPreserveFieldInfoAI = 4,
};
std::map<std::string, GlobalVariable *> GEPGlobals;
// A map to link preserve_*_access_index intrinsic calls.
- std::map<CallInst *, std::pair<CallInst *, uint32_t>> AIChain;
+ std::map<CallInst *, std::pair<CallInst *, CallInfo>> AIChain;
// A map to hold all the base preserve_*_access_index intrinsic calls.
- // The base call is not an input of any other preserve_*_access_index
+ // The base call is not an input of any other preserve_*
// intrinsics.
- std::map<CallInst *, uint32_t> BaseAICalls;
+ std::map<CallInst *, CallInfo> BaseAICalls;
bool doTransformation(Module &M);
- void traceAICall(CallInst *Call, uint32_t Kind);
- void traceBitCast(BitCastInst *BitCast, CallInst *Parent, uint32_t Kind);
- void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, uint32_t Kind);
+ void traceAICall(CallInst *Call, CallInfo &ParentInfo);
+ void traceBitCast(BitCastInst *BitCast, CallInst *Parent,
+ CallInfo &ParentInfo);
+ void traceGEP(GetElementPtrInst *GEP, CallInst *Parent,
+ CallInfo &ParentInfo);
void collectAICallChains(Module &M, Function &F);
- bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind);
+ bool IsPreserveDIAccessIndexCall(const CallInst *Call, CallInfo &Cinfo);
+ bool IsValidAIChain(const MDNode *ParentMeta, uint32_t ParentAI,
+ const MDNode *ChildMeta);
bool removePreserveAccessIndexIntrinsic(Module &M);
void replaceWithGEP(std::vector<CallInst *> &CallList,
uint32_t NumOfZerosIndex, uint32_t DIIndex);
-
- Value *computeBaseAndAccessStr(CallInst *Call, std::string &AccessStr,
- std::string &AccessKey, uint32_t Kind,
- MDNode *&TypeMeta);
- bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex);
- bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind);
+ bool HasPreserveFieldInfoCall(CallInfoStack &CallStack);
+ void GetStorageBitRange(DICompositeType *CTy, DIDerivedType *MemberTy,
+ uint32_t AccessIndex, uint32_t &StartBitOffset,
+ uint32_t &EndBitOffset);
+ uint32_t GetFieldInfo(uint32_t InfoKind, DICompositeType *CTy,
+ uint32_t AccessIndex, uint32_t PatchImm);
+
+ Value *computeBaseAndAccessKey(CallInst *Call, CallInfo &CInfo,
+ std::string &AccessKey, MDNode *&BaseMeta);
+ uint64_t getConstant(const Value *IndexValue);
+ bool transformGEPChain(Module &M, CallInst *Call, CallInfo &CInfo);
};
} // End anonymous namespace
@@ -128,23 +171,65 @@ char BPFAbstractMemberAccess::ID = 0;
INITIALIZE_PASS(BPFAbstractMemberAccess, DEBUG_TYPE,
"abstracting struct/union member accessees", false, false)
-ModulePass *llvm::createBPFAbstractMemberAccess() {
- return new BPFAbstractMemberAccess();
+ModulePass *llvm::createBPFAbstractMemberAccess(BPFTargetMachine *TM) {
+ return new BPFAbstractMemberAccess(TM);
}
bool BPFAbstractMemberAccess::runOnModule(Module &M) {
LLVM_DEBUG(dbgs() << "********** Abstract Member Accesses **********\n");
// Bail out if no debug info.
- if (empty(M.debug_compile_units()))
+ if (M.debug_compile_units().empty())
return false;
return doTransformation(M);
}
+static bool SkipDIDerivedTag(unsigned Tag) {
+ if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
+ Tag != dwarf::DW_TAG_volatile_type &&
+ Tag != dwarf::DW_TAG_restrict_type &&
+ Tag != dwarf::DW_TAG_member)
+ return false;
+ return true;
+}
+
+static DIType *stripQualifiers(DIType *Ty) {
+ while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+ if (!SkipDIDerivedTag(DTy->getTag()))
+ break;
+ Ty = DTy->getBaseType();
+ }
+ return Ty;
+}
+
+static const DIType *stripQualifiers(const DIType *Ty) {
+ while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+ if (!SkipDIDerivedTag(DTy->getTag()))
+ break;
+ Ty = DTy->getBaseType();
+ }
+ return Ty;
+}
+
+static uint32_t calcArraySize(const DICompositeType *CTy, uint32_t StartDim) {
+ DINodeArray Elements = CTy->getElements();
+ uint32_t DimSize = 1;
+ for (uint32_t I = StartDim; I < Elements.size(); ++I) {
+ if (auto *Element = dyn_cast_or_null<DINode>(Elements[I]))
+ if (Element->getTag() == dwarf::DW_TAG_subrange_type) {
+ const DISubrange *SR = cast<DISubrange>(Element);
+ auto *CI = SR->getCount().dyn_cast<ConstantInt *>();
+ DimSize *= CI->getSExtValue();
+ }
+ }
+
+ return DimSize;
+}
+
/// Check whether a call is a preserve_*_access_index intrinsic call or not.
bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
- uint32_t &Kind) {
+ CallInfo &CInfo) {
if (!Call)
return false;
@@ -152,15 +237,40 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
if (!GV)
return false;
if (GV->getName().startswith("llvm.preserve.array.access.index")) {
- Kind = BPFPreserveArrayAI;
+ CInfo.Kind = BPFPreserveArrayAI;
+ CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+ if (!CInfo.Metadata)
+ report_fatal_error("Missing metadata for llvm.preserve.array.access.index intrinsic");
+ CInfo.AccessIndex = getConstant(Call->getArgOperand(2));
+ CInfo.Base = Call->getArgOperand(0);
return true;
}
if (GV->getName().startswith("llvm.preserve.union.access.index")) {
- Kind = BPFPreserveUnionAI;
+ CInfo.Kind = BPFPreserveUnionAI;
+ CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+ if (!CInfo.Metadata)
+ report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic");
+ CInfo.AccessIndex = getConstant(Call->getArgOperand(1));
+ CInfo.Base = Call->getArgOperand(0);
return true;
}
if (GV->getName().startswith("llvm.preserve.struct.access.index")) {
- Kind = BPFPreserveStructAI;
+ CInfo.Kind = BPFPreserveStructAI;
+ CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+ if (!CInfo.Metadata)
+ report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic");
+ CInfo.AccessIndex = getConstant(Call->getArgOperand(2));
+ CInfo.Base = Call->getArgOperand(0);
+ return true;
+ }
+ if (GV->getName().startswith("llvm.bpf.preserve.field.info")) {
+ CInfo.Kind = BPFPreserveFieldInfoAI;
+ CInfo.Metadata = nullptr;
+ // Check validity of info_kind as clang did not check this.
+ uint64_t InfoKind = getConstant(Call->getArgOperand(1));
+ if (InfoKind >= BPFCoreSharedInfo::MAX_FIELD_RELOC_KIND)
+ report_fatal_error("Incorrect info_kind for llvm.bpf.preserve.field.info intrinsic");
+ CInfo.AccessIndex = InfoKind;
return true;
}
@@ -173,8 +283,7 @@ void BPFAbstractMemberAccess::replaceWithGEP(std::vector<CallInst *> &CallList,
for (auto Call : CallList) {
uint32_t Dimension = 1;
if (DimensionIndex > 0)
- Dimension = cast<ConstantInt>(Call->getArgOperand(DimensionIndex))
- ->getZExtValue();
+ Dimension = getConstant(Call->getArgOperand(DimensionIndex));
Constant *Zero =
ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0);
@@ -200,14 +309,14 @@ bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
for (auto &BB : F)
for (auto &I : BB) {
auto *Call = dyn_cast<CallInst>(&I);
- uint32_t Kind;
- if (!IsPreserveDIAccessIndexCall(Call, Kind))
+ CallInfo CInfo;
+ if (!IsPreserveDIAccessIndexCall(Call, CInfo))
continue;
Found = true;
- if (Kind == BPFPreserveArrayAI)
+ if (CInfo.Kind == BPFPreserveArrayAI)
PreserveArrayIndexCalls.push_back(Call);
- else if (Kind == BPFPreserveUnionAI)
+ else if (CInfo.Kind == BPFPreserveUnionAI)
PreserveUnionIndexCalls.push_back(Call);
else
PreserveStructIndexCalls.push_back(Call);
@@ -233,79 +342,146 @@ bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
return Found;
}
-void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind) {
+/// Check whether the access index chain is valid. We check
+/// here because there may be type casts between two
+/// access indexes. We want to ensure the memory access is still valid.
+bool BPFAbstractMemberAccess::IsValidAIChain(const MDNode *ParentType,
+ uint32_t ParentAI,
+ const MDNode *ChildType) {
+ if (!ChildType)
+ return true; // preserve_field_info, no type comparison needed.
+
+ const DIType *PType = stripQualifiers(cast<DIType>(ParentType));
+ const DIType *CType = stripQualifiers(cast<DIType>(ChildType));
+
+ // Child is a derived/pointer type, which is due to type casting.
+ // Pointer type cannot be in the middle of chain.
+ if (isa<DIDerivedType>(CType))
+ return false;
+
+ // Parent is a pointer type.
+ if (const auto *PtrTy = dyn_cast<DIDerivedType>(PType)) {
+ if (PtrTy->getTag() != dwarf::DW_TAG_pointer_type)
+ return false;
+ return stripQualifiers(PtrTy->getBaseType()) == CType;
+ }
+
+ // Otherwise, struct/union/array types
+ const auto *PTy = dyn_cast<DICompositeType>(PType);
+ const auto *CTy = dyn_cast<DICompositeType>(CType);
+ assert(PTy && CTy && "ParentType or ChildType is null or not composite");
+
+ uint32_t PTyTag = PTy->getTag();
+ assert(PTyTag == dwarf::DW_TAG_array_type ||
+ PTyTag == dwarf::DW_TAG_structure_type ||
+ PTyTag == dwarf::DW_TAG_union_type);
+
+ uint32_t CTyTag = CTy->getTag();
+ assert(CTyTag == dwarf::DW_TAG_array_type ||
+ CTyTag == dwarf::DW_TAG_structure_type ||
+ CTyTag == dwarf::DW_TAG_union_type);
+
+ // Multi-dimensional arrays: the base element types should be the same.
+ if (PTyTag == dwarf::DW_TAG_array_type && PTyTag == CTyTag)
+ return PTy->getBaseType() == CTy->getBaseType();
+
+ DIType *Ty;
+ if (PTyTag == dwarf::DW_TAG_array_type)
+ Ty = PTy->getBaseType();
+ else
+ Ty = dyn_cast<DIType>(PTy->getElements()[ParentAI]);
+
+ return dyn_cast<DICompositeType>(stripQualifiers(Ty)) == CTy;
+}
+
+void BPFAbstractMemberAccess::traceAICall(CallInst *Call,
+ CallInfo &ParentInfo) {
for (User *U : Call->users()) {
Instruction *Inst = dyn_cast<Instruction>(U);
if (!Inst)
continue;
if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
- traceBitCast(BI, Call, Kind);
+ traceBitCast(BI, Call, ParentInfo);
} else if (auto *CI = dyn_cast<CallInst>(Inst)) {
- uint32_t CIKind;
- if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
- AIChain[CI] = std::make_pair(Call, Kind);
- traceAICall(CI, CIKind);
+ CallInfo ChildInfo;
+
+ if (IsPreserveDIAccessIndexCall(CI, ChildInfo) &&
+ IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex,
+ ChildInfo.Metadata)) {
+ AIChain[CI] = std::make_pair(Call, ParentInfo);
+ traceAICall(CI, ChildInfo);
} else {
- BaseAICalls[Call] = Kind;
+ BaseAICalls[Call] = ParentInfo;
}
} else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
if (GI->hasAllZeroIndices())
- traceGEP(GI, Call, Kind);
+ traceGEP(GI, Call, ParentInfo);
else
- BaseAICalls[Call] = Kind;
+ BaseAICalls[Call] = ParentInfo;
+ } else {
+ BaseAICalls[Call] = ParentInfo;
}
}
}
void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast,
- CallInst *Parent, uint32_t Kind) {
+ CallInst *Parent,
+ CallInfo &ParentInfo) {
for (User *U : BitCast->users()) {
Instruction *Inst = dyn_cast<Instruction>(U);
if (!Inst)
continue;
if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
- traceBitCast(BI, Parent, Kind);
+ traceBitCast(BI, Parent, ParentInfo);
} else if (auto *CI = dyn_cast<CallInst>(Inst)) {
- uint32_t CIKind;
- if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
- AIChain[CI] = std::make_pair(Parent, Kind);
- traceAICall(CI, CIKind);
+ CallInfo ChildInfo;
+ if (IsPreserveDIAccessIndexCall(CI, ChildInfo) &&
+ IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex,
+ ChildInfo.Metadata)) {
+ AIChain[CI] = std::make_pair(Parent, ParentInfo);
+ traceAICall(CI, ChildInfo);
} else {
- BaseAICalls[Parent] = Kind;
+ BaseAICalls[Parent] = ParentInfo;
}
} else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
if (GI->hasAllZeroIndices())
- traceGEP(GI, Parent, Kind);
+ traceGEP(GI, Parent, ParentInfo);
else
- BaseAICalls[Parent] = Kind;
+ BaseAICalls[Parent] = ParentInfo;
+ } else {
+ BaseAICalls[Parent] = ParentInfo;
}
}
}
void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent,
- uint32_t Kind) {
+ CallInfo &ParentInfo) {
for (User *U : GEP->users()) {
Instruction *Inst = dyn_cast<Instruction>(U);
if (!Inst)
continue;
if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
- traceBitCast(BI, Parent, Kind);
+ traceBitCast(BI, Parent, ParentInfo);
} else if (auto *CI = dyn_cast<CallInst>(Inst)) {
- uint32_t CIKind;
- if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
- AIChain[CI] = std::make_pair(Parent, Kind);
- traceAICall(CI, CIKind);
+ CallInfo ChildInfo;
+ if (IsPreserveDIAccessIndexCall(CI, ChildInfo) &&
+ IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex,
+ ChildInfo.Metadata)) {
+ AIChain[CI] = std::make_pair(Parent, ParentInfo);
+ traceAICall(CI, ChildInfo);
} else {
- BaseAICalls[Parent] = Kind;
+ BaseAICalls[Parent] = ParentInfo;
}
} else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
if (GI->hasAllZeroIndices())
- traceGEP(GI, Parent, Kind);
+ traceGEP(GI, Parent, ParentInfo);
else
- BaseAICalls[Parent] = Kind;
+ BaseAICalls[Parent] = ParentInfo;
+ } else {
+ BaseAICalls[Parent] = ParentInfo;
}
}
}
@@ -316,92 +492,345 @@ void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) {
for (auto &BB : F)
for (auto &I : BB) {
- uint32_t Kind;
+ CallInfo CInfo;
auto *Call = dyn_cast<CallInst>(&I);
- if (!IsPreserveDIAccessIndexCall(Call, Kind) ||
+ if (!IsPreserveDIAccessIndexCall(Call, CInfo) ||
AIChain.find(Call) != AIChain.end())
continue;
- traceAICall(Call, Kind);
+ traceAICall(Call, CInfo);
}
}
-/// Get access index from the preserve_*_access_index intrinsic calls.
-bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue,
- uint64_t &AccessIndex) {
+uint64_t BPFAbstractMemberAccess::getConstant(const Value *IndexValue) {
const ConstantInt *CV = dyn_cast<ConstantInt>(IndexValue);
- if (!CV)
- return false;
+ assert(CV);
+ return CV->getValue().getZExtValue();
+}
- AccessIndex = CV->getValue().getZExtValue();
- return true;
+/// Get the start and the end of the storage offset for \p MemberTy.
+/// The storage bits correspond to the LLVM internal types,
+/// and the storage bits for the member determine what load width
+/// to use in order to extract the bitfield value.
+void BPFAbstractMemberAccess::GetStorageBitRange(DICompositeType *CTy,
+ DIDerivedType *MemberTy,
+ uint32_t AccessIndex,
+ uint32_t &StartBitOffset,
+ uint32_t &EndBitOffset) {
+ auto SOff = dyn_cast<ConstantInt>(MemberTy->getStorageOffsetInBits());
+ assert(SOff);
+ StartBitOffset = SOff->getZExtValue();
+
+ EndBitOffset = CTy->getSizeInBits();
+ uint32_t Index = AccessIndex + 1;
+ for (; Index < CTy->getElements().size(); ++Index) {
+ auto Member = cast<DIDerivedType>(CTy->getElements()[Index]);
+ if (!Member->getStorageOffsetInBits()) {
+ EndBitOffset = Member->getOffsetInBits();
+ break;
+ }
+ SOff = dyn_cast<ConstantInt>(Member->getStorageOffsetInBits());
+ assert(SOff);
+ unsigned BitOffset = SOff->getZExtValue();
+ if (BitOffset != StartBitOffset) {
+ EndBitOffset = BitOffset;
+ break;
+ }
+ }
+}
+
+uint32_t BPFAbstractMemberAccess::GetFieldInfo(uint32_t InfoKind,
+ DICompositeType *CTy,
+ uint32_t AccessIndex,
+ uint32_t PatchImm) {
+ if (InfoKind == BPFCoreSharedInfo::FIELD_EXISTENCE)
+ return 1;
+
+ uint32_t Tag = CTy->getTag();
+ if (InfoKind == BPFCoreSharedInfo::FIELD_BYTE_OFFSET) {
+ if (Tag == dwarf::DW_TAG_array_type) {
+ auto *EltTy = stripQualifiers(CTy->getBaseType());
+ PatchImm += AccessIndex * calcArraySize(CTy, 1) *
+ (EltTy->getSizeInBits() >> 3);
+ } else if (Tag == dwarf::DW_TAG_structure_type) {
+ auto *MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]);
+ if (!MemberTy->isBitField()) {
+ PatchImm += MemberTy->getOffsetInBits() >> 3;
+ } else {
+ auto SOffset = dyn_cast<ConstantInt>(MemberTy->getStorageOffsetInBits());
+ assert(SOffset);
+ PatchImm += SOffset->getZExtValue() >> 3;
+ }
+ }
+ return PatchImm;
+ }
+
+ if (InfoKind == BPFCoreSharedInfo::FIELD_BYTE_SIZE) {
+ if (Tag == dwarf::DW_TAG_array_type) {
+ auto *EltTy = stripQualifiers(CTy->getBaseType());
+ return calcArraySize(CTy, 1) * (EltTy->getSizeInBits() >> 3);
+ } else {
+ auto *MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]);
+ uint32_t SizeInBits = MemberTy->getSizeInBits();
+ if (!MemberTy->isBitField())
+ return SizeInBits >> 3;
+
+ unsigned SBitOffset, NextSBitOffset;
+ GetStorageBitRange(CTy, MemberTy, AccessIndex, SBitOffset, NextSBitOffset);
+ SizeInBits = NextSBitOffset - SBitOffset;
+ if (SizeInBits & (SizeInBits - 1))
+ report_fatal_error("Unsupported field expression for llvm.bpf.preserve.field.info");
+ return SizeInBits >> 3;
+ }
+ }
+
+ if (InfoKind == BPFCoreSharedInfo::FIELD_SIGNEDNESS) {
+ const DIType *BaseTy;
+ if (Tag == dwarf::DW_TAG_array_type) {
+ // Signedness is only checked when the final array elements are accessed.
+ if (CTy->getElements().size() != 1)
+ report_fatal_error("Invalid array expression for llvm.bpf.preserve.field.info");
+ BaseTy = stripQualifiers(CTy->getBaseType());
+ } else {
+ auto *MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]);
+ BaseTy = stripQualifiers(MemberTy->getBaseType());
+ }
+
+ // Only basic types and enum types have signedness.
+ const auto *BTy = dyn_cast<DIBasicType>(BaseTy);
+ while (!BTy) {
+ const auto *CompTy = dyn_cast<DICompositeType>(BaseTy);
+ // Report an error if the field expression does not have signedness.
+ if (!CompTy || CompTy->getTag() != dwarf::DW_TAG_enumeration_type)
+ report_fatal_error("Invalid field expression for llvm.bpf.preserve.field.info");
+ BaseTy = stripQualifiers(CompTy->getBaseType());
+ BTy = dyn_cast<DIBasicType>(BaseTy);
+ }
+ uint32_t Encoding = BTy->getEncoding();
+ return (Encoding == dwarf::DW_ATE_signed || Encoding == dwarf::DW_ATE_signed_char);
+ }
+
+ if (InfoKind == BPFCoreSharedInfo::FIELD_LSHIFT_U64) {
+ // The value is loaded into a value with FIELD_BYTE_SIZE size,
+ // and then zero or sign extended to U64.
+ // FIELD_LSHIFT_U64 and FIELD_RSHIFT_U64 are operations
+ // to extract the original value.
+ const Triple &Triple = TM->getTargetTriple();
+ DIDerivedType *MemberTy = nullptr;
+ bool IsBitField = false;
+ uint32_t SizeInBits;
+
+ if (Tag == dwarf::DW_TAG_array_type) {
+ auto *EltTy = stripQualifiers(CTy->getBaseType());
+ SizeInBits = calcArraySize(CTy, 1) * EltTy->getSizeInBits();
+ } else {
+ MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]);
+ SizeInBits = MemberTy->getSizeInBits();
+ IsBitField = MemberTy->isBitField();
+ }
+
+ if (!IsBitField) {
+ if (SizeInBits > 64)
+ report_fatal_error("too big field size for llvm.bpf.preserve.field.info");
+ return 64 - SizeInBits;
+ }
+
+ unsigned SBitOffset, NextSBitOffset;
+ GetStorageBitRange(CTy, MemberTy, AccessIndex, SBitOffset, NextSBitOffset);
+ if (NextSBitOffset - SBitOffset > 64)
+ report_fatal_error("too big field size for llvm.bpf.preserve.field.info");
+
+ unsigned OffsetInBits = MemberTy->getOffsetInBits();
+ if (Triple.getArch() == Triple::bpfel)
+ return SBitOffset + 64 - OffsetInBits - SizeInBits;
+ else
+ return OffsetInBits + 64 - NextSBitOffset;
+ }
+
+ if (InfoKind == BPFCoreSharedInfo::FIELD_RSHIFT_U64) {
+ DIDerivedType *MemberTy = nullptr;
+ bool IsBitField = false;
+ uint32_t SizeInBits;
+ if (Tag == dwarf::DW_TAG_array_type) {
+ auto *EltTy = stripQualifiers(CTy->getBaseType());
+ SizeInBits = calcArraySize(CTy, 1) * EltTy->getSizeInBits();
+ } else {
+ MemberTy = cast<DIDerivedType>(CTy->getElements()[AccessIndex]);
+ SizeInBits = MemberTy->getSizeInBits();
+ IsBitField = MemberTy->isBitField();
+ }
+
+ if (!IsBitField) {
+ if (SizeInBits > 64)
+ report_fatal_error("too big field size for llvm.bpf.preserve.field.info");
+ return 64 - SizeInBits;
+ }
+
+ unsigned SBitOffset, NextSBitOffset;
+ GetStorageBitRange(CTy, MemberTy, AccessIndex, SBitOffset, NextSBitOffset);
+ if (NextSBitOffset - SBitOffset > 64)
+ report_fatal_error("too big field size for llvm.bpf.preserve.field.info");
+
+ return 64 - SizeInBits;
+ }
+
+ llvm_unreachable("Unknown llvm.bpf.preserve.field.info info kind");
}
-/// Compute the base of the whole preserve_*_access_index chains, i.e., the base
+bool BPFAbstractMemberAccess::HasPreserveFieldInfoCall(CallInfoStack &CallStack) {
+ // This is called on the error return path; no need to maintain CallStack.
+ while (CallStack.size()) {
+ auto StackElem = CallStack.top();
+ if (StackElem.second.Kind == BPFPreserveFieldInfoAI)
+ return true;
+ CallStack.pop();
+ }
+ return false;
+}
+
+/// Compute the base of the whole preserve_* intrinsics chains, i.e., the base
/// pointer of the first preserve_*_access_index call, and construct the access
/// string, which will be the name of a global variable.
-Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call,
- std::string &AccessStr,
+Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
+ CallInfo &CInfo,
std::string &AccessKey,
- uint32_t Kind,
MDNode *&TypeMeta) {
Value *Base = nullptr;
- std::vector<uint64_t> AccessIndices;
- uint64_t TypeNameIndex = 0;
- std::string LastTypeName;
+ std::string TypeName;
+ CallInfoStack CallStack;
+ // Put the access chain into a stack with the top as the head of the chain.
while (Call) {
- // Base of original corresponding GEP
- Base = Call->getArgOperand(0);
-
- // Type Name
- std::string TypeName;
- MDNode *MDN;
- if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) {
- MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index);
- if (!MDN)
- return nullptr;
-
- DIType *Ty = dyn_cast<DIType>(MDN);
- if (!Ty)
- return nullptr;
+ CallStack.push(std::make_pair(Call, CInfo));
+ CInfo = AIChain[Call].second;
+ Call = AIChain[Call].first;
+ }
+ // The access offset from the base of the head of the chain is also
+ // calculated here, as all debuginfo types are available.
+
+ // Get type name and calculate the first index.
+ // We only want to get the type name from a structure or union.
+ // If user wants a relocation like
+ // int *p; ... __builtin_preserve_access_index(&p[4]) ...
+ // or
+ // int a[10][20]; ... __builtin_preserve_access_index(&a[2][3]) ...
+ // we will skip them.
+ uint32_t FirstIndex = 0;
+ uint32_t PatchImm = 0; // AccessOffset or the requested field info
+ uint32_t InfoKind = BPFCoreSharedInfo::FIELD_BYTE_OFFSET;
+ while (CallStack.size()) {
+ auto StackElem = CallStack.top();
+ Call = StackElem.first;
+ CInfo = StackElem.second;
+
+ if (!Base)
+ Base = CInfo.Base;
+
+ DIType *Ty = stripQualifiers(cast<DIType>(CInfo.Metadata));
+ if (CInfo.Kind == BPFPreserveUnionAI ||
+ CInfo.Kind == BPFPreserveStructAI) {
+ // struct or union type
TypeName = Ty->getName();
+ TypeMeta = Ty;
+ PatchImm += FirstIndex * (Ty->getSizeInBits() >> 3);
+ break;
}
- // Access Index
- uint64_t AccessIndex;
- uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ? 1 : 2;
- if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex))
- return nullptr;
-
- AccessIndices.push_back(AccessIndex);
- if (TypeName.size()) {
- TypeNameIndex = AccessIndices.size() - 1;
- LastTypeName = TypeName;
- TypeMeta = MDN;
+ assert(CInfo.Kind == BPFPreserveArrayAI);
+
+ // Array entries are always consumed and folded into the accumulated initial index.
+ CallStack.pop();
+
+ // BPFPreserveArrayAI
+ uint64_t AccessIndex = CInfo.AccessIndex;
+
+ DIType *BaseTy = nullptr;
+ bool CheckElemType = false;
+ if (const auto *CTy = dyn_cast<DICompositeType>(Ty)) {
+ // array type
+ assert(CTy->getTag() == dwarf::DW_TAG_array_type);
+
+ FirstIndex += AccessIndex * calcArraySize(CTy, 1);
+ BaseTy = stripQualifiers(CTy->getBaseType());
+ CheckElemType = CTy->getElements().size() == 1;
+ } else {
+ // pointer type
+ auto *DTy = cast<DIDerivedType>(Ty);
+ assert(DTy->getTag() == dwarf::DW_TAG_pointer_type);
+
+ BaseTy = stripQualifiers(DTy->getBaseType());
+ CTy = dyn_cast<DICompositeType>(BaseTy);
+ if (!CTy) {
+ CheckElemType = true;
+ } else if (CTy->getTag() != dwarf::DW_TAG_array_type) {
+ FirstIndex += AccessIndex;
+ CheckElemType = true;
+ } else {
+ FirstIndex += AccessIndex * calcArraySize(CTy, 0);
+ }
}
- Kind = AIChain[Call].second;
- Call = AIChain[Call].first;
- }
+ if (CheckElemType) {
+ auto *CTy = dyn_cast<DICompositeType>(BaseTy);
+ if (!CTy) {
+ if (HasPreserveFieldInfoCall(CallStack))
+ report_fatal_error("Invalid field access for llvm.preserve.field.info intrinsic");
+ return nullptr;
+ }
- // The intial type name is required.
- // FIXME: if the initial type access is an array index, e.g.,
- // &a[3].b.c, only one dimentional array is supported.
- if (!LastTypeName.size() || AccessIndices.size() > TypeNameIndex + 2)
- return nullptr;
+ unsigned CTag = CTy->getTag();
+ if (CTag == dwarf::DW_TAG_structure_type || CTag == dwarf::DW_TAG_union_type) {
+ TypeName = CTy->getName();
+ } else {
+ if (HasPreserveFieldInfoCall(CallStack))
+ report_fatal_error("Invalid field access for llvm.preserve.field.info intrinsic");
+ return nullptr;
+ }
+ TypeMeta = CTy;
+ PatchImm += FirstIndex * (CTy->getSizeInBits() >> 3);
+ break;
+ }
+ }
+ assert(TypeName.size());
+ AccessKey += std::to_string(FirstIndex);
+
+ // Traverse the rest of the access chain to complete the offset calculation
+ // and access key construction.
+ while (CallStack.size()) {
+ auto StackElem = CallStack.top();
+ CInfo = StackElem.second;
+ CallStack.pop();
+
+ if (CInfo.Kind == BPFPreserveFieldInfoAI)
+ break;
+
+ // If the next Call (the top of the stack) is a BPFPreserveFieldInfoAI,
+ // the action will be extracting field info.
+ if (CallStack.size()) {
+ auto StackElem2 = CallStack.top();
+ CallInfo CInfo2 = StackElem2.second;
+ if (CInfo2.Kind == BPFPreserveFieldInfoAI) {
+ InfoKind = CInfo2.AccessIndex;
+ assert(CallStack.size() == 1);
+ }
+ }
- // Construct the type string AccessStr.
- for (unsigned I = 0; I < AccessIndices.size(); ++I)
- AccessStr = std::to_string(AccessIndices[I]) + ":" + AccessStr;
+ // Access Index
+ uint64_t AccessIndex = CInfo.AccessIndex;
+ AccessKey += ":" + std::to_string(AccessIndex);
- if (TypeNameIndex == AccessIndices.size() - 1)
- AccessStr = "0:" + AccessStr;
+ MDNode *MDN = CInfo.Metadata;
+ // At this stage, it cannot be a pointer type.
+ auto *CTy = cast<DICompositeType>(stripQualifiers(cast<DIType>(MDN)));
+ PatchImm = GetFieldInfo(InfoKind, CTy, AccessIndex, PatchImm);
+ }
- // Access key is the type name + access string, uniquely identifying
- // one kernel memory access.
- AccessKey = LastTypeName + ":" + AccessStr;
+ // Access key is the type name + reloc type + patched imm + access string,
+ // uniquely identifying one relocation.
+ AccessKey = TypeName + ":" + std::to_string(InfoKind) + ":" +
+ std::to_string(PatchImm) + "$" + AccessKey;
return Base;
}
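The FIELD_LSHIFT_U64/FIELD_RSHIFT_U64 values computed above are meant to be applied as a shift pair to the loaded storage unit; for a plain (non-bitfield) 2-byte member, for instance, both come out as 64 - 16 = 48. The consuming side is outside this patch; the sketch below only illustrates that contract, and the function and variable names are invented for the illustration.

/* 'storage' is the FIELD_BYTE_SIZE-wide unit read at FIELD_BYTE_OFFSET and
 * zero-extended to 64 bits; the two relocated shifts recover the field. */
unsigned long long extract_unsigned_field(unsigned long long storage,
                                          unsigned lshift, unsigned rshift) {
  return (storage << lshift) >> rshift;
}

long long extract_signed_field(unsigned long long storage,
                               unsigned lshift, unsigned rshift) {
  /* FIELD_SIGNEDNESS selects this variant; the arithmetic shift sign-extends. */
  return ((long long)(storage << lshift)) >> rshift;
}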
@@ -409,39 +838,52 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call,
/// Call/Kind is the base preserve_*_access_index() call. Attempts to do
/// transformation to a chain of relocatable GEPs.
bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
- uint32_t Kind) {
- std::string AccessStr, AccessKey;
- MDNode *TypeMeta = nullptr;
+ CallInfo &CInfo) {
+ std::string AccessKey;
+ MDNode *TypeMeta;
Value *Base =
- computeBaseAndAccessStr(Call, AccessStr, AccessKey, Kind, TypeMeta);
+ computeBaseAndAccessKey(Call, CInfo, AccessKey, TypeMeta);
if (!Base)
return false;
- // Do the transformation
- // For any original GEP Call and Base %2 like
- // %4 = bitcast %struct.net_device** %dev1 to i64*
- // it is transformed to:
- // %6 = load __BTF_0:sk_buff:0:0:2:0:
- // %7 = bitcast %struct.sk_buff* %2 to i8*
- // %8 = getelementptr i8, i8* %7, %6
- // %9 = bitcast i8* %8 to i64*
- // using %9 instead of %4
- // The original Call inst is removed.
BasicBlock *BB = Call->getParent();
GlobalVariable *GV;
if (GEPGlobals.find(AccessKey) == GEPGlobals.end()) {
- GV = new GlobalVariable(M, Type::getInt64Ty(BB->getContext()), false,
- GlobalVariable::ExternalLinkage, NULL, AccessStr);
+ IntegerType *VarType;
+ if (CInfo.Kind == BPFPreserveFieldInfoAI)
+ VarType = Type::getInt32Ty(BB->getContext()); // 32bit return value
+ else
+ VarType = Type::getInt64Ty(BB->getContext()); // 64bit ptr arith
+
+ GV = new GlobalVariable(M, VarType, false, GlobalVariable::ExternalLinkage,
+ NULL, AccessKey);
GV->addAttribute(BPFCoreSharedInfo::AmaAttr);
- // Set the metadata (debuginfo types) for the global.
- if (TypeMeta)
- GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta);
+ GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta);
GEPGlobals[AccessKey] = GV;
} else {
GV = GEPGlobals[AccessKey];
}
+ if (CInfo.Kind == BPFPreserveFieldInfoAI) {
+ // Load the global variable which represents the returned field info.
+ auto *LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV);
+ BB->getInstList().insert(Call->getIterator(), LDInst);
+ Call->replaceAllUsesWith(LDInst);
+ Call->eraseFromParent();
+ return true;
+ }
+
+ // For any original GEP Call and Base %2 like
+ // %4 = bitcast %struct.net_device** %dev1 to i64*
+ // it is transformed to:
+ // %6 = load sk_buff:50:$0:0:0:2:0
+ // %7 = bitcast %struct.sk_buff* %2 to i8*
+ // %8 = getelementptr i8, i8* %7, %6
+ // %9 = bitcast i8* %8 to i64*
+ // using %9 instead of %4
+ // The original Call inst is removed.
+
// Load the global variable.
auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV);
BB->getInstList().insert(Call->getIterator(), LDInst);
diff --git a/lib/Target/BPF/BPFAsmPrinter.cpp b/lib/Target/BPF/BPFAsmPrinter.cpp
index e61e73468057..218b0302927c 100644
--- a/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -59,7 +59,7 @@ bool BPFAsmPrinter::doInitialization(Module &M) {
AsmPrinter::doInitialization(M);
// Only emit BTF when debuginfo available.
- if (MAI->doesSupportDebugInformation() && !empty(M.debug_compile_units())) {
+ if (MAI->doesSupportDebugInformation() && !M.debug_compile_units().empty()) {
BTF = new BTFDebug(this);
Handlers.push_back(HandlerInfo(std::unique_ptr<BTFDebug>(BTF), "emit",
"Debug Info Emission", "BTF",
diff --git a/lib/Target/BPF/BPFCORE.h b/lib/Target/BPF/BPFCORE.h
index e0950d95f8d7..ed4778353e52 100644
--- a/lib/Target/BPF/BPFCORE.h
+++ b/lib/Target/BPF/BPFCORE.h
@@ -13,10 +13,18 @@ namespace llvm {
class BPFCoreSharedInfo {
public:
- /// The attribute attached to globals representing a member offset
+ enum OffsetRelocKind : uint32_t {
+ FIELD_BYTE_OFFSET = 0,
+ FIELD_BYTE_SIZE,
+ FIELD_EXISTENCE,
+ FIELD_SIGNEDNESS,
+ FIELD_LSHIFT_U64,
+ FIELD_RSHIFT_U64,
+
+ MAX_FIELD_RELOC_KIND,
+ };
+ /// The attribute attached to globals representing a field access
static const std::string AmaAttr;
- /// The section name to identify a patchable external global
- static const std::string PatchableExtSecName;
};
} // namespace llvm
diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h
index 2dc6277d2244..a546351ec6cb 100644
--- a/lib/Target/BPF/BPFFrameLowering.h
+++ b/lib/Target/BPF/BPFFrameLowering.h
@@ -21,7 +21,7 @@ class BPFSubtarget;
class BPFFrameLowering : public TargetFrameLowering {
public:
explicit BPFFrameLowering(const BPFSubtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) {}
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8), 0) {}
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 1bd705c55188..f2be0ff070d2 100644
--- a/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -45,9 +45,7 @@ class BPFDAGToDAGISel : public SelectionDAGISel {
public:
explicit BPFDAGToDAGISel(BPFTargetMachine &TM)
- : SelectionDAGISel(TM), Subtarget(nullptr) {
- curr_func_ = nullptr;
- }
+ : SelectionDAGISel(TM), Subtarget(nullptr) {}
StringRef getPassName() const override {
return "BPF DAG->DAG Pattern Instruction Selection";
@@ -92,14 +90,8 @@ private:
val_vec_type &Vals, int Offset);
bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset,
uint64_t Size, unsigned char *ByteSeq);
- bool checkLoadDef(unsigned DefReg, unsigned match_load_op);
-
// Mapping from ConstantStruct global value to corresponding byte-list values
std::map<const void *, val_vec_type> cs_vals_;
- // Mapping from vreg to load memory opcode
- std::map<unsigned, unsigned> load_to_vreg_;
- // Current function
- const Function *curr_func_;
};
} // namespace
@@ -325,32 +317,13 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
}
void BPFDAGToDAGISel::PreprocessISelDAG() {
- // Iterate through all nodes, interested in the following cases:
+ // Iterate through all nodes, interested in the following case:
//
// . loads from ConstantStruct or ConstantArray of constructs
// which can be turned into constants themselves; with this we can
// avoid reading from the read-only section at runtime.
//
- // . reg truncating is often the result of 8/16/32bit->64bit or
- // 8/16bit->32bit conversion. If the reg value is loaded with
- // masked byte width, the AND operation can be removed since
- // BPF LOAD already has zero extension.
- //
- // This also solved a correctness issue.
- // In BPF socket-related program, e.g., __sk_buff->{data, data_end}
- // are 32-bit registers, but later on, kernel verifier will rewrite
- // it with 64-bit value. Therefore, truncating the value after the
- // load will result in incorrect code.
-
- // clear the load_to_vreg_ map so that we have a clean start
- // for this function.
- if (!curr_func_) {
- curr_func_ = FuncInfo->Fn;
- } else if (curr_func_ != FuncInfo->Fn) {
- load_to_vreg_.clear();
- curr_func_ = FuncInfo->Fn;
- }
-
+ // . Removing redundant AND for intrinsic narrow loads.
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end();
I != E;) {
@@ -358,8 +331,6 @@ void BPFDAGToDAGISel::PreprocessISelDAG() {
unsigned Opcode = Node->getOpcode();
if (Opcode == ISD::LOAD)
PreprocessLoad(Node, I);
- else if (Opcode == ISD::CopyToReg)
- PreprocessCopyToReg(Node);
else if (Opcode == ISD::AND)
PreprocessTrunc(Node, I);
}
@@ -491,37 +462,6 @@ bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL,
return true;
}
-void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) {
- const RegisterSDNode *RegN = dyn_cast<RegisterSDNode>(Node->getOperand(1));
- if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
- return;
-
- const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node->getOperand(2));
- if (!LD)
- return;
-
- // Assign a load value to a virtual register. record its load width
- unsigned mem_load_op = 0;
- switch (LD->getMemOperand()->getSize()) {
- default:
- return;
- case 4:
- mem_load_op = BPF::LDW;
- break;
- case 2:
- mem_load_op = BPF::LDH;
- break;
- case 1:
- mem_load_op = BPF::LDB;
- break;
- }
-
- LLVM_DEBUG(dbgs() << "Find Load Value to VReg "
- << TargetRegisterInfo::virtReg2Index(RegN->getReg())
- << '\n');
- load_to_vreg_[RegN->getReg()] = mem_load_op;
-}
-
void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
SelectionDAG::allnodes_iterator &I) {
ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1));
@@ -535,112 +475,26 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
// which the generic optimizer doesn't understand their results are
// zero extended.
SDValue BaseV = Node->getOperand(0);
- if (BaseV.getOpcode() == ISD::INTRINSIC_W_CHAIN) {
- unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
- uint64_t MaskV = MaskN->getZExtValue();
-
- if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
- (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
- (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
- return;
-
- LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
- Node->dump(); dbgs() << '\n');
-
- I--;
- CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
- I++;
- CurDAG->DeleteNode(Node);
-
- return;
- }
-
- // Multiple basic blocks case.
- if (BaseV.getOpcode() != ISD::CopyFromReg)
+ if (BaseV.getOpcode() != ISD::INTRINSIC_W_CHAIN)
return;
- unsigned match_load_op = 0;
- switch (MaskN->getZExtValue()) {
- default:
- return;
- case 0xFFFFFFFF:
- match_load_op = BPF::LDW;
- break;
- case 0xFFFF:
- match_load_op = BPF::LDH;
- break;
- case 0xFF:
- match_load_op = BPF::LDB;
- break;
- }
+ unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
+ uint64_t MaskV = MaskN->getZExtValue();
- const RegisterSDNode *RegN =
- dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1));
- if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
+ if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
+ (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) ||
+ (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF)))
return;
- unsigned AndOpReg = RegN->getReg();
- LLVM_DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n');
-
- // Examine the PHI insns in the MachineBasicBlock to found out the
- // definitions of this virtual register. At this stage (DAG2DAG
- // transformation), only PHI machine insns are available in the machine basic
- // block.
- MachineBasicBlock *MBB = FuncInfo->MBB;
- MachineInstr *MII = nullptr;
- for (auto &MI : *MBB) {
- for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
- const MachineOperand &MOP = MI.getOperand(i);
- if (!MOP.isReg() || !MOP.isDef())
- continue;
- unsigned Reg = MOP.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg) && Reg == AndOpReg) {
- MII = &MI;
- break;
- }
- }
- }
-
- if (MII == nullptr) {
- // No phi definition in this block.
- if (!checkLoadDef(AndOpReg, match_load_op))
- return;
- } else {
- // The PHI node looks like:
- // %2 = PHI %0, <%bb.1>, %1, <%bb.3>
- // Trace each incoming definition, e.g., (%0, %bb.1) and (%1, %bb.3)
- // The AND operation can be removed if both %0 in %bb.1 and %1 in
- // %bb.3 are defined with a load matching the MaskN.
- LLVM_DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
- unsigned PrevReg = -1;
- for (unsigned i = 0; i < MII->getNumOperands(); ++i) {
- const MachineOperand &MOP = MII->getOperand(i);
- if (MOP.isReg()) {
- if (MOP.isDef())
- continue;
- PrevReg = MOP.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(PrevReg))
- return;
- if (!checkLoadDef(PrevReg, match_load_op))
- return;
- }
- }
- }
- LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
- dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: ";
+ Node->dump(); dbgs() << '\n');
I--;
CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
I++;
CurDAG->DeleteNode(Node);
-}
-
-bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) {
- auto it = load_to_vreg_.find(DefReg);
- if (it == load_to_vreg_.end())
- return false; // The definition of register is not exported yet.
- return it->second == match_load_op;
+ return;
}
FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
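With the cross-block register tracking removed, the AND elimination now only recognizes the single-block pattern fed by the legacy packet-access intrinsics. A rough source-level illustration follows; the load_byte asm-alias declaration and the offsets are assumptions modeled on common BPF sample headers, not part of this patch.

unsigned long long load_byte(void *skb, unsigned long long off)
    asm("llvm.bpf.load.byte");

int is_udp(void *skb) {
  /* The & 0xFF is redundant: llvm.bpf.load.byte already zero-extends its
   * result, so PreprocessTrunc drops the AND when the mask matches the
   * load width (0xFF/0xFFFF/0xFFFFFFFF for byte/half/word). */
  return (load_byte(skb, 23) & 0xFF) == 17; /* 14 + 9: iphdr protocol; 17 = UDP */
}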
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index ff69941d26fb..56e0288f26c9 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -132,9 +132,9 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setBooleanContents(ZeroOrOneBooleanContent);
- // Function alignments (log2)
- setMinFunctionAlignment(3);
- setPrefFunctionAlignment(3);
+ // Function alignments
+ setMinFunctionAlignment(Align(8));
+ setPrefFunctionAlignment(Align(8));
if (BPFExpandMemcpyInOrder) {
// LLVM generic code will try to expand memcpy into load/store pairs at this
@@ -236,9 +236,8 @@ SDValue BPFTargetLowering::LowerFormalArguments(
}
case MVT::i32:
case MVT::i64:
- unsigned VReg = RegInfo.createVirtualRegister(SimpleTy == MVT::i64 ?
- &BPF::GPRRegClass :
- &BPF::GPR32RegClass);
+ Register VReg = RegInfo.createVirtualRegister(
+ SimpleTy == MVT::i64 ? &BPF::GPRRegClass : &BPF::GPR32RegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
@@ -571,9 +570,9 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
DebugLoc DL = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
- unsigned PromotedReg0 = RegInfo.createVirtualRegister(RC);
- unsigned PromotedReg1 = RegInfo.createVirtualRegister(RC);
- unsigned PromotedReg2 = RegInfo.createVirtualRegister(RC);
+ Register PromotedReg0 = RegInfo.createVirtualRegister(RC);
+ Register PromotedReg1 = RegInfo.createVirtualRegister(RC);
+ Register PromotedReg2 = RegInfo.createVirtualRegister(RC);
BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg);
BuildMI(BB, DL, TII.get(BPF::SLL_ri), PromotedReg1)
.addReg(PromotedReg0).addImm(32);
@@ -699,7 +698,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
report_fatal_error("unimplemented select CondCode " + Twine(CC));
}
- unsigned LHS = MI.getOperand(1).getReg();
+ Register LHS = MI.getOperand(1).getReg();
bool isSignedCmp = (CC == ISD::SETGT ||
CC == ISD::SETGE ||
CC == ISD::SETLT ||
@@ -716,7 +715,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
LHS = EmitSubregExt(MI, BB, LHS, isSignedCmp);
if (isSelectRROp) {
- unsigned RHS = MI.getOperand(2).getReg();
+ Register RHS = MI.getOperand(2).getReg();
if (is32BitCmp && !HasJmp32)
RHS = EmitSubregExt(MI, BB, RHS, isSignedCmp);
diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp
index 932f718d5490..6de3a4084d3d 100644
--- a/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/lib/Target/BPF/BPFInstrInfo.cpp
@@ -43,11 +43,11 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(1).getReg();
uint64_t CopyLen = MI->getOperand(2).getImm();
uint64_t Alignment = MI->getOperand(3).getImm();
- unsigned ScratchReg = MI->getOperand(4).getReg();
+ Register ScratchReg = MI->getOperand(4).getReg();
MachineBasicBlock *BB = MI->getParent();
DebugLoc dl = MI->getDebugLoc();
unsigned LdOpc, StOpc;
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index c44702a78ec8..ae5a82a99303 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -473,7 +473,7 @@ class CALL<string OpcodeStr>
class CALLX<string OpcodeStr>
: TYPE_ALU_JMP<BPF_CALL.Value, BPF_X.Value,
(outs),
- (ins calltarget:$BrDst),
+ (ins GPR:$BrDst),
!strconcat(OpcodeStr, " $BrDst"),
[]> {
bits<32> BrDst;
diff --git a/lib/Target/BPF/BPFMIChecking.cpp b/lib/Target/BPF/BPFMIChecking.cpp
index 4c46289656b4..f82f166eda4d 100644
--- a/lib/Target/BPF/BPFMIChecking.cpp
+++ b/lib/Target/BPF/BPFMIChecking.cpp
@@ -19,6 +19,7 @@
#include "BPFTargetMachine.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
diff --git a/lib/Target/BPF/BPFMIPeephole.cpp b/lib/Target/BPF/BPFMIPeephole.cpp
index 156ba793e359..e9eecc55c3c3 100644
--- a/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/lib/Target/BPF/BPFMIPeephole.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -71,7 +72,7 @@ void BPFMIPeephole::initialize(MachineFunction &MFParm) {
MF = &MFParm;
MRI = &MF->getRegInfo();
TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
- LLVM_DEBUG(dbgs() << "*** BPF MachineSSA peephole pass ***\n\n");
+ LLVM_DEBUG(dbgs() << "*** BPF MachineSSA ZEXT Elim peephole pass ***\n\n");
}
bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
@@ -104,10 +105,10 @@ bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
if (!opnd.isReg())
return false;
- unsigned Reg = opnd.getReg();
- if ((TargetRegisterInfo::isVirtualRegister(Reg) &&
+ Register Reg = opnd.getReg();
+ if ((Register::isVirtualRegister(Reg) &&
MRI->getRegClass(Reg) == &BPF::GPRRegClass))
- return false;
+ return false;
}
LLVM_DEBUG(dbgs() << " One ZExt elim sequence identified.\n");
@@ -134,8 +135,8 @@ bool BPFMIPeephole::eliminateZExtSeq(void) {
// SRL_ri rB, rB, 32
if (MI.getOpcode() == BPF::SRL_ri &&
MI.getOperand(2).getImm() == 32) {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned ShfReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register ShfReg = MI.getOperand(1).getReg();
MachineInstr *SllMI = MRI->getVRegDef(ShfReg);
LLVM_DEBUG(dbgs() << "Starting SRL found:");
@@ -159,7 +160,7 @@ bool BPFMIPeephole::eliminateZExtSeq(void) {
LLVM_DEBUG(dbgs() << " Type cast Mov found:");
LLVM_DEBUG(MovMI->dump());
- unsigned SubReg = MovMI->getOperand(1).getReg();
+ Register SubReg = MovMI->getOperand(1).getReg();
if (!isMovFrom32Def(MovMI)) {
LLVM_DEBUG(dbgs()
<< " One ZExt elim sequence failed qualifying elim.\n");
@@ -186,7 +187,8 @@ bool BPFMIPeephole::eliminateZExtSeq(void) {
} // end default namespace
INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE,
- "BPF MachineSSA Peephole Optimization", false, false)
+ "BPF MachineSSA Peephole Optimization For ZEXT Eliminate",
+ false, false)
char BPFMIPeephole::ID = 0;
FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); }
@@ -253,12 +255,16 @@ bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
// enabled. The special type cast insn MOV_32_64 involves different
// register class on src (i32) and dst (i64), RA could generate useless
// instruction due to this.
- if (MI.getOpcode() == BPF::MOV_32_64) {
- unsigned dst = MI.getOperand(0).getReg();
- unsigned dst_sub = TRI->getSubReg(dst, BPF::sub_32);
- unsigned src = MI.getOperand(1).getReg();
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == BPF::MOV_32_64 ||
+ Opcode == BPF::MOV_rr || Opcode == BPF::MOV_rr_32) {
+ Register dst = MI.getOperand(0).getReg();
+ Register src = MI.getOperand(1).getReg();
+
+ if (Opcode == BPF::MOV_32_64)
+ dst = TRI->getSubReg(dst, BPF::sub_32);
- if (dst_sub != src)
+ if (dst != src)
continue;
ToErase = &MI;
@@ -281,3 +287,177 @@ FunctionPass* llvm::createBPFMIPreEmitPeepholePass()
{
return new BPFMIPreEmitPeephole();
}
+
+STATISTIC(TruncElemNum, "Number of truncations eliminated");
+
+namespace {
+
+struct BPFMIPeepholeTruncElim : public MachineFunctionPass {
+
+ static char ID;
+ const BPFInstrInfo *TII;
+ MachineFunction *MF;
+ MachineRegisterInfo *MRI;
+
+ BPFMIPeepholeTruncElim() : MachineFunctionPass(ID) {
+ initializeBPFMIPeepholeTruncElimPass(*PassRegistry::getPassRegistry());
+ }
+
+private:
+ // Initialize class variables.
+ void initialize(MachineFunction &MFParm);
+
+ bool eliminateTruncSeq(void);
+
+public:
+
+ // Main entry point for this pass.
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ initialize(MF);
+
+ return eliminateTruncSeq();
+ }
+};
+
+static bool TruncSizeCompatible(int TruncSize, unsigned opcode)
+{
+ if (TruncSize == 1)
+ return opcode == BPF::LDB || opcode == BPF::LDB32;
+
+ if (TruncSize == 2)
+ return opcode == BPF::LDH || opcode == BPF::LDH32;
+
+ if (TruncSize == 4)
+ return opcode == BPF::LDW || opcode == BPF::LDW32;
+
+ return false;
+}
+
+// Initialize class variables.
+void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) {
+ MF = &MFParm;
+ MRI = &MF->getRegInfo();
+ TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
+ LLVM_DEBUG(dbgs() << "*** BPF MachineSSA TRUNC Elim peephole pass ***\n\n");
+}
+
+// Register truncation is often the result of an 8/16/32-bit -> 64-bit or
+// 8/16-bit -> 32-bit conversion. If the register value is loaded with the
+// same width that the mask selects, the AND operation can be removed since
+// the BPF LOAD instructions already zero-extend.
+//
+// This also solves a correctness issue.
+// In BPF socket-related programs, e.g., __sk_buff->{data, data_end}
+// are 32-bit registers, but later on the kernel verifier will rewrite
+// them with 64-bit values. Therefore, truncating the value after the
+// load will result in incorrect code.
+bool BPFMIPeepholeTruncElim::eliminateTruncSeq(void) {
+ MachineInstr* ToErase = nullptr;
+ bool Eliminated = false;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ // The second insn to remove if the elimination candidate is a pair.
+ MachineInstr *MI2 = nullptr;
+ Register DstReg, SrcReg;
+ MachineInstr *DefMI;
+ int TruncSize = -1;
+
+ // If the previous instruction was marked for elimination, remove it now.
+ if (ToErase) {
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ // AND A, 0xFFFFFFFF will be turned into an SLL/SRL pair because the
+ // immediate for BPF ANDI is i32; this case only happens on ALU64.
+ if (MI.getOpcode() == BPF::SRL_ri &&
+ MI.getOperand(2).getImm() == 32) {
+ SrcReg = MI.getOperand(1).getReg();
+ MI2 = MRI->getVRegDef(SrcReg);
+ DstReg = MI.getOperand(0).getReg();
+
+ if (!MI2 ||
+ MI2->getOpcode() != BPF::SLL_ri ||
+ MI2->getOperand(2).getImm() != 32)
+ continue;
+
+ // Update SrcReg.
+ SrcReg = MI2->getOperand(1).getReg();
+ DefMI = MRI->getVRegDef(SrcReg);
+ if (DefMI)
+ TruncSize = 4;
+ } else if (MI.getOpcode() == BPF::AND_ri ||
+ MI.getOpcode() == BPF::AND_ri_32) {
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ DefMI = MRI->getVRegDef(SrcReg);
+
+ if (!DefMI)
+ continue;
+
+ int64_t imm = MI.getOperand(2).getImm();
+ if (imm == 0xff)
+ TruncSize = 1;
+ else if (imm == 0xffff)
+ TruncSize = 2;
+ }
+
+ if (TruncSize == -1)
+ continue;
+
+ // The definition is a PHI node; check all inputs.
+ if (DefMI->isPHI()) {
+ bool CheckFail = false;
+
+ for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
+ MachineOperand &opnd = DefMI->getOperand(i);
+ if (!opnd.isReg()) {
+ CheckFail = true;
+ break;
+ }
+
+ MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
+ if (!PhiDef || PhiDef->isPHI() ||
+ !TruncSizeCompatible(TruncSize, PhiDef->getOpcode())) {
+ CheckFail = true;
+ break;
+ }
+ }
+
+ if (CheckFail)
+ continue;
+ } else if (!TruncSizeCompatible(TruncSize, DefMI->getOpcode())) {
+ continue;
+ }
+
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::MOV_rr), DstReg)
+ .addReg(SrcReg);
+
+ if (MI2)
+ MI2->eraseFromParent();
+
+ // Record it in ToErase and erase it in the next iteration.
+ ToErase = &MI;
+ TruncElemNum++;
+ Eliminated = true;
+ }
+ }
+
+ return Eliminated;
+}
+
+} // end default namespace
+
+INITIALIZE_PASS(BPFMIPeepholeTruncElim, "bpf-mi-trunc-elim",
+ "BPF MachineSSA Peephole Optimization For TRUNC Eliminate",
+ false, false)
+
+char BPFMIPeepholeTruncElim::ID = 0;
+FunctionPass* llvm::createBPFMIPeepholeTruncElimPass()
+{
+ return new BPFMIPeepholeTruncElim();
+}
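A source-level sketch of the pattern the new BPFMIPeepholeTruncElim pass targets (the function below is hypothetical, not taken from the patch): the explicit mask after a narrow load is redundant because the BPF LDB/LDH/LDW forms already zero-extend into the destination register, so the AND (or the ALU64 SLL/SRL pair) is rewritten into a plain MOV_rr.

    #include <cstdint>

    // Hypothetical example: the mask repeats what the byte load already guarantees.
    uint64_t load_byte(const uint8_t *p) {
      uint64_t v = p[0];    // lowered to a byte load, value is already zero-extended
      return v & 0xff;      // AND_ri 0xff -> candidate for elimination by this pass
    }

    int main() {
      uint8_t b = 0x7f;
      return load_byte(&b) == 0x7f ? 0 : 1;
    }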
diff --git a/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/lib/Target/BPF/BPFMISimplifyPatchable.cpp
index e9114d7187e3..9c689aed6417 100644
--- a/lib/Target/BPF/BPFMISimplifyPatchable.cpp
+++ b/lib/Target/BPF/BPFMISimplifyPatchable.cpp
@@ -11,19 +11,15 @@
// ldd r2, r1, 0
// add r3, struct_base_reg, r2
//
-// Here @global should either present a AMA (abstruct member access) or
-// a patchable extern variable. And these two kinds of accesses
-// are subject to bpf load time patching. After this pass, the
+// Here @global should represent an AMA (abstract member access).
+// Such an access is subject to BPF load-time patching. After this pass, the
// code becomes
// ld_imm64 r1, @global
// add r3, struct_base_reg, r1
//
// Eventually, at BTF output stage, a relocation record will be generated
// for ld_imm64 which should be replaced later by bpf loader:
-// r1 = <calculated offset> or <to_be_patched_extern_val>
-// add r3, struct_base_reg, r1
-// or
-// ld_imm64 r1, <to_be_patched_extern_val>
+// r1 = <calculated field_info>
// add r3, struct_base_reg, r1
//
//===----------------------------------------------------------------------===//
@@ -34,6 +30,7 @@
#include "BPFTargetMachine.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -100,9 +97,8 @@ bool BPFMISimplifyPatchable::removeLD() {
if (!MI.getOperand(2).isImm() || MI.getOperand(2).getImm())
continue;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
- int64_t ImmVal = MI.getOperand(2).getImm();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
MachineInstr *DefInst = MRI->getUniqueVRegDef(SrcReg);
if (!DefInst)
@@ -118,17 +114,8 @@ bool BPFMISimplifyPatchable::removeLD() {
// Global variables representing structure offset or
// patchable extern globals.
if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
- assert(ImmVal == 0);
+ assert(MI.getOperand(2).getImm() == 0);
IsCandidate = true;
- } else if (!GVar->hasInitializer() && GVar->hasExternalLinkage() &&
- GVar->getSection() ==
- BPFCoreSharedInfo::PatchableExtSecName) {
- if (ImmVal == 0)
- IsCandidate = true;
- else
- errs() << "WARNING: unhandled patchable extern "
- << GVar->getName() << " with load offset " << ImmVal
- << "\n";
}
}
}
diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp
index 714af06e11d9..8de81a469b84 100644
--- a/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -77,7 +77,7 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
}
- unsigned FrameReg = getFrameRegister(MF);
+ Register FrameReg = getFrameRegister(MF);
int FrameIndex = MI.getOperand(i).getIndex();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -86,7 +86,7 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
WarnSize(Offset, MF, DL);
MI.getOperand(i).ChangeToRegister(FrameReg, false);
- unsigned reg = MI.getOperand(i - 1).getReg();
+ Register reg = MI.getOperand(i - 1).getReg();
BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg)
.addReg(reg)
.addImm(Offset);
@@ -105,7 +105,7 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// architecture does not really support FI_ri, replace it with
// MOV_rr <target_reg>, frame_reg
// ADD_ri <target_reg>, imm
- unsigned reg = MI.getOperand(i - 1).getReg();
+ Register reg = MI.getOperand(i - 1).getReg();
BuildMI(MBB, ++II, DL, TII.get(BPF::MOV_rr), reg)
.addReg(FrameReg);
diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
index 24c0ff0f7f15..0c4f2c74e7a4 100644
--- a/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/lib/Target/BPF/BPFTargetMachine.cpp
@@ -36,6 +36,7 @@ extern "C" void LLVMInitializeBPFTarget() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeBPFAbstractMemberAccessPass(PR);
initializeBPFMIPeepholePass(PR);
+ initializeBPFMIPeepholeTruncElimPass(PR);
}
// DataLayout: little or big endian
@@ -61,7 +62,7 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
- TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
@@ -94,7 +95,7 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
void BPFPassConfig::addIRPasses() {
- addPass(createBPFAbstractMemberAccess());
+ addPass(createBPFAbstractMemberAccess(&getBPFTargetMachine()));
TargetPassConfig::addIRPasses();
}
@@ -115,15 +116,16 @@ void BPFPassConfig::addMachineSSAOptimization() {
TargetPassConfig::addMachineSSAOptimization();
const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
- if (Subtarget->getHasAlu32() && !DisableMIPeephole)
- addPass(createBPFMIPeepholePass());
+ if (!DisableMIPeephole) {
+ if (Subtarget->getHasAlu32())
+ addPass(createBPFMIPeepholePass());
+ addPass(createBPFMIPeepholeTruncElimPass());
+ }
}
void BPFPassConfig::addPreEmitPass() {
- const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl();
-
addPass(createBPFMIPreEmitCheckingPass());
if (getOptLevel() != CodeGenOpt::None)
- if (Subtarget->getHasAlu32() && !DisableMIPeephole)
+ if (!DisableMIPeephole)
addPass(createBPFMIPreEmitPeepholePass());
}
diff --git a/lib/Target/BPF/BTF.h b/lib/Target/BPF/BTF.h
index ad56716710a6..a13c862bf840 100644
--- a/lib/Target/BPF/BTF.h
+++ b/lib/Target/BPF/BTF.h
@@ -17,7 +17,7 @@
///
/// The binary layout for .BTF.ext section:
/// struct ExtHeader
-/// FuncInfo, LineInfo, OffsetReloc and ExternReloc subsections
+/// FuncInfo, LineInfo, FieldReloc and ExternReloc subsections
/// The FuncInfo subsection is defined as below:
/// BTFFuncInfo Size
/// struct SecFuncInfo for ELF section #1
@@ -32,19 +32,12 @@
/// struct SecLineInfo for ELF section #2
/// A number of struct BPFLineInfo for ELF section #2
/// ...
-/// The OffsetReloc subsection is defined as below:
-/// BPFOffsetReloc Size
-/// struct SecOffsetReloc for ELF section #1
-/// A number of struct BPFOffsetReloc for ELF section #1
-/// struct SecOffsetReloc for ELF section #2
-/// A number of struct BPFOffsetReloc for ELF section #2
-/// ...
-/// The ExternReloc subsection is defined as below:
-/// BPFExternReloc Size
-/// struct SecExternReloc for ELF section #1
-/// A number of struct BPFExternReloc for ELF section #1
-/// struct SecExternReloc for ELF section #2
-/// A number of struct BPFExternReloc for ELF section #2
+/// The FieldReloc subsection is defined as below:
+/// BPFFieldReloc Size
+/// struct SecFieldReloc for ELF section #1
+/// A number of struct BPFFieldReloc for ELF section #1
+/// struct SecFieldReloc for ELF section #2
+/// A number of struct BPFFieldReloc for ELF section #2
/// ...
///
/// The section formats are also defined at
@@ -63,7 +56,7 @@ enum : uint32_t { MAGIC = 0xeB9F, VERSION = 1 };
/// Sizes in bytes of various things in the BTF format.
enum {
HeaderSize = 24,
- ExtHeaderSize = 40,
+ ExtHeaderSize = 32,
CommonTypeSize = 12,
BTFArraySize = 12,
BTFEnumSize = 8,
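The two size constants that change follow directly from the struct edits later in this file: the ext header drops the OffsetReloc/ExternReloc off/len pairs (four uint32_t) and gains one FieldReloc pair (two uint32_t), and each field-reloc record grows by the 4-byte RelocKind. A quick standalone check of that arithmetic:

    #include <cstdint>

    static_assert(40 - 4 * sizeof(uint32_t) + 2 * sizeof(uint32_t) == 32,
                  "new ExtHeaderSize");
    static_assert(12 + sizeof(uint32_t) == 16, "new BPFFieldRelocSize");

    int main() { return 0; }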
@@ -72,12 +65,10 @@ enum {
BTFDataSecVarSize = 12,
SecFuncInfoSize = 8,
SecLineInfoSize = 8,
- SecOffsetRelocSize = 8,
- SecExternRelocSize = 8,
+ SecFieldRelocSize = 8,
BPFFuncInfoSize = 8,
BPFLineInfoSize = 16,
- BPFOffsetRelocSize = 12,
- BPFExternRelocSize = 8,
+ BPFFieldRelocSize = 16,
};
/// The .BTF section header definition.
@@ -213,10 +204,8 @@ struct ExtHeader {
uint32_t FuncInfoLen; ///< Length of func info section
uint32_t LineInfoOff; ///< Offset of line info section
uint32_t LineInfoLen; ///< Length of line info section
- uint32_t OffsetRelocOff; ///< Offset of offset reloc section
- uint32_t OffsetRelocLen; ///< Length of offset reloc section
- uint32_t ExternRelocOff; ///< Offset of extern reloc section
- uint32_t ExternRelocLen; ///< Length of extern reloc section
+ uint32_t FieldRelocOff; ///< Offset of field reloc section
+ uint32_t FieldRelocLen; ///< Length of field reloc section
};
/// Specifying one function info.
@@ -247,28 +236,17 @@ struct SecLineInfo {
};
/// Specifying one offset relocation.
-struct BPFOffsetReloc {
+struct BPFFieldReloc {
uint32_t InsnOffset; ///< Byte offset in this section
uint32_t TypeID; ///< TypeID for the relocation
uint32_t OffsetNameOff; ///< The string to traverse types
+ uint32_t RelocKind; ///< How to patch the instruction
};
/// Specifying offset relocation's in one section.
-struct SecOffsetReloc {
- uint32_t SecNameOff; ///< Section name index in the .BTF string table
- uint32_t NumOffsetReloc; ///< Number of offset reloc's in this section
-};
-
-/// Specifying one offset relocation.
-struct BPFExternReloc {
- uint32_t InsnOffset; ///< Byte offset in this section
- uint32_t ExternNameOff; ///< The string for external variable
-};
-
-/// Specifying extern relocation's in one section.
-struct SecExternReloc {
+struct SecFieldReloc {
uint32_t SecNameOff; ///< Section name index in the .BTF string table
- uint32_t NumExternReloc; ///< Number of extern reloc's in this section
+ uint32_t NumFieldReloc; ///< Number of field reloc's in this section
};
} // End namespace BTF.
diff --git a/lib/Target/BPF/BTFDebug.cpp b/lib/Target/BPF/BTFDebug.cpp
index fa35c6619e21..db551e739bd7 100644
--- a/lib/Target/BPF/BTFDebug.cpp
+++ b/lib/Target/BPF/BTFDebug.cpp
@@ -184,9 +184,7 @@ void BTFTypeEnum::emitType(MCStreamer &OS) {
}
}
-BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize,
- uint32_t NumElems)
- : ElemSize(ElemSize) {
+BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t NumElems) {
Kind = BTF::BTF_KIND_ARRAY;
BTFType.NameOff = 0;
BTFType.Info = Kind << 24;
@@ -216,12 +214,6 @@ void BTFTypeArray::emitType(MCStreamer &OS) {
OS.EmitIntValue(ArrayInfo.Nelems, 4);
}
-void BTFTypeArray::getLocInfo(uint32_t Loc, uint32_t &LocOffset,
- uint32_t &ElementTypeId) {
- ElementTypeId = ArrayInfo.ElemType;
- LocOffset = Loc * ElemSize;
-}
-
/// Represent either a struct or a union.
BTFTypeStruct::BTFTypeStruct(const DICompositeType *STy, bool IsStruct,
bool HasBitField, uint32_t Vlen)
@@ -251,7 +243,8 @@ void BTFTypeStruct::completeType(BTFDebug &BDebug) {
} else {
BTFMember.Offset = DDTy->getOffsetInBits();
}
- BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType());
+ const auto *BaseTy = DDTy->getBaseType();
+ BTFMember.Type = BDebug.getTypeId(BaseTy);
Members.push_back(BTFMember);
}
}
@@ -268,15 +261,6 @@ void BTFTypeStruct::emitType(MCStreamer &OS) {
std::string BTFTypeStruct::getName() { return STy->getName(); }
-void BTFTypeStruct::getMemberInfo(uint32_t Loc, uint32_t &MemberOffset,
- uint32_t &MemberType) {
- MemberType = Members[Loc].Type;
- MemberOffset =
- HasBitField ? Members[Loc].Offset & 0xffffff : Members[Loc].Offset;
-}
-
-uint32_t BTFTypeStruct::getStructSize() { return STy->getSizeInBits() >> 3; }
-
/// The Func kind represents both subprogram and pointee of function
/// pointers. If the FuncName is empty, it represents a pointee of function
/// pointer. Otherwise, it represents a subprogram. The func arg names
@@ -428,7 +412,7 @@ void BTFDebug::visitBasicType(const DIBasicType *BTy, uint32_t &TypeId) {
// Create a BTF type instance for this DIBasicType and put it into
// DIToIdMap for cross-type reference check.
- auto TypeEntry = llvm::make_unique<BTFTypeInt>(
+ auto TypeEntry = std::make_unique<BTFTypeInt>(
Encoding, BTy->getSizeInBits(), BTy->getOffsetInBits(), BTy->getName());
TypeId = addType(std::move(TypeEntry), BTy);
}
@@ -447,7 +431,7 @@ void BTFDebug::visitSubroutineType(
// a function pointer has an empty name. The subprogram type will
// not be added to DIToIdMap as it should not be referenced by
// any other types.
- auto TypeEntry = llvm::make_unique<BTFTypeFuncProto>(STy, VLen, FuncArgNames);
+ auto TypeEntry = std::make_unique<BTFTypeFuncProto>(STy, VLen, FuncArgNames);
if (ForSubprog)
TypeId = addType(std::move(TypeEntry)); // For subprogram
else
@@ -478,7 +462,7 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
}
auto TypeEntry =
- llvm::make_unique<BTFTypeStruct>(CTy, IsStruct, HasBitField, VLen);
+ std::make_unique<BTFTypeStruct>(CTy, IsStruct, HasBitField, VLen);
StructTypes.push_back(TypeEntry.get());
TypeId = addType(std::move(TypeEntry), CTy);
@@ -489,35 +473,29 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) {
// Visit array element type.
- uint32_t ElemTypeId, ElemSize;
+ uint32_t ElemTypeId;
const DIType *ElemType = CTy->getBaseType();
visitTypeEntry(ElemType, ElemTypeId, false, false);
- ElemSize = ElemType->getSizeInBits() >> 3;
- if (!CTy->getSizeInBits()) {
- auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemTypeId, 0, 0);
- ArrayTypes.push_back(TypeEntry.get());
- ElemTypeId = addType(std::move(TypeEntry), CTy);
- } else {
- // Visit array dimensions.
- DINodeArray Elements = CTy->getElements();
- for (int I = Elements.size() - 1; I >= 0; --I) {
- if (auto *Element = dyn_cast_or_null<DINode>(Elements[I]))
- if (Element->getTag() == dwarf::DW_TAG_subrange_type) {
- const DISubrange *SR = cast<DISubrange>(Element);
- auto *CI = SR->getCount().dyn_cast<ConstantInt *>();
- int64_t Count = CI->getSExtValue();
-
- auto TypeEntry =
- llvm::make_unique<BTFTypeArray>(ElemTypeId, ElemSize, Count);
- ArrayTypes.push_back(TypeEntry.get());
- if (I == 0)
- ElemTypeId = addType(std::move(TypeEntry), CTy);
- else
- ElemTypeId = addType(std::move(TypeEntry));
- ElemSize = ElemSize * Count;
- }
- }
+ // Visit array dimensions.
+ DINodeArray Elements = CTy->getElements();
+ for (int I = Elements.size() - 1; I >= 0; --I) {
+ if (auto *Element = dyn_cast_or_null<DINode>(Elements[I]))
+ if (Element->getTag() == dwarf::DW_TAG_subrange_type) {
+ const DISubrange *SR = cast<DISubrange>(Element);
+ auto *CI = SR->getCount().dyn_cast<ConstantInt *>();
+ int64_t Count = CI->getSExtValue();
+
+ // For struct s { int b; char c[]; }, the c[] will be represented
+ // as an array with Count = -1.
+ auto TypeEntry =
+ std::make_unique<BTFTypeArray>(ElemTypeId,
+ Count >= 0 ? Count : 0);
+ if (I == 0)
+ ElemTypeId = addType(std::move(TypeEntry), CTy);
+ else
+ ElemTypeId = addType(std::move(TypeEntry));
+ }
}
// The array TypeId is the type id of the outermost dimension.
@@ -526,7 +504,7 @@ void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) {
// The IR does not have a type for array index while BTF wants one.
// So create an array index type if there is none.
if (!ArrayIndexTypeId) {
- auto TypeEntry = llvm::make_unique<BTFTypeInt>(dwarf::DW_ATE_unsigned, 32,
+ auto TypeEntry = std::make_unique<BTFTypeInt>(dwarf::DW_ATE_unsigned, 32,
0, "__ARRAY_SIZE_TYPE__");
ArrayIndexTypeId = addType(std::move(TypeEntry));
}
@@ -538,7 +516,7 @@ void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) {
if (VLen > BTF::MAX_VLEN)
return;
- auto TypeEntry = llvm::make_unique<BTFTypeEnum>(CTy, VLen);
+ auto TypeEntry = std::make_unique<BTFTypeEnum>(CTy, VLen);
TypeId = addType(std::move(TypeEntry), CTy);
// No need to visit base type as BTF does not encode it.
}
@@ -546,7 +524,7 @@ void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) {
/// Handle structure/union forward declarations.
void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion,
uint32_t &TypeId) {
- auto TypeEntry = llvm::make_unique<BTFTypeFwd>(CTy->getName(), IsUnion);
+ auto TypeEntry = std::make_unique<BTFTypeFwd>(CTy->getName(), IsUnion);
TypeId = addType(std::move(TypeEntry), CTy);
}
@@ -588,7 +566,7 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
/// Find a candidate, generate a fixup. Later on the struct/union
/// pointee type will be replaced with either a real type or
/// a forward declaration.
- auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, true);
+ auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, true);
auto &Fixup = FixupDerivedTypes[CTy->getName()];
Fixup.first = CTag == dwarf::DW_TAG_union_type;
Fixup.second.push_back(TypeEntry.get());
@@ -602,7 +580,7 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
if (Tag == dwarf::DW_TAG_pointer_type || Tag == dwarf::DW_TAG_typedef ||
Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type ||
Tag == dwarf::DW_TAG_restrict_type) {
- auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, false);
+ auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, false);
TypeId = addType(std::move(TypeEntry), DTy);
} else if (Tag != dwarf::DW_TAG_member) {
return;
@@ -669,7 +647,7 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
}
auto TypeEntry =
- llvm::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size());
+ std::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size());
StructTypes.push_back(TypeEntry.get());
TypeId = addType(std::move(TypeEntry), CTy);
@@ -774,9 +752,10 @@ void BTFDebug::emitBTFSection() {
}
void BTFDebug::emitBTFExtSection() {
- // Do not emit section if empty FuncInfoTable and LineInfoTable.
+ // Do not emit the section if FuncInfoTable, LineInfoTable, and
+ // FieldRelocTable are all empty.
if (!FuncInfoTable.size() && !LineInfoTable.size() &&
- !OffsetRelocTable.size() && !ExternRelocTable.size())
+ !FieldRelocTable.size())
return;
MCContext &Ctx = OS.getContext();
@@ -788,8 +767,8 @@ void BTFDebug::emitBTFExtSection() {
// Account for FuncInfo/LineInfo record size as well.
uint32_t FuncLen = 4, LineLen = 4;
- // Do not account for optional OffsetReloc/ExternReloc.
- uint32_t OffsetRelocLen = 0, ExternRelocLen = 0;
+ // Do not account for optional FieldReloc.
+ uint32_t FieldRelocLen = 0;
for (const auto &FuncSec : FuncInfoTable) {
FuncLen += BTF::SecFuncInfoSize;
FuncLen += FuncSec.second.size() * BTF::BPFFuncInfoSize;
@@ -798,28 +777,20 @@ void BTFDebug::emitBTFExtSection() {
LineLen += BTF::SecLineInfoSize;
LineLen += LineSec.second.size() * BTF::BPFLineInfoSize;
}
- for (const auto &OffsetRelocSec : OffsetRelocTable) {
- OffsetRelocLen += BTF::SecOffsetRelocSize;
- OffsetRelocLen += OffsetRelocSec.second.size() * BTF::BPFOffsetRelocSize;
- }
- for (const auto &ExternRelocSec : ExternRelocTable) {
- ExternRelocLen += BTF::SecExternRelocSize;
- ExternRelocLen += ExternRelocSec.second.size() * BTF::BPFExternRelocSize;
+ for (const auto &FieldRelocSec : FieldRelocTable) {
+ FieldRelocLen += BTF::SecFieldRelocSize;
+ FieldRelocLen += FieldRelocSec.second.size() * BTF::BPFFieldRelocSize;
}
- if (OffsetRelocLen)
- OffsetRelocLen += 4;
- if (ExternRelocLen)
- ExternRelocLen += 4;
+ if (FieldRelocLen)
+ FieldRelocLen += 4;
OS.EmitIntValue(0, 4);
OS.EmitIntValue(FuncLen, 4);
OS.EmitIntValue(FuncLen, 4);
OS.EmitIntValue(LineLen, 4);
OS.EmitIntValue(FuncLen + LineLen, 4);
- OS.EmitIntValue(OffsetRelocLen, 4);
- OS.EmitIntValue(FuncLen + LineLen + OffsetRelocLen, 4);
- OS.EmitIntValue(ExternRelocLen, 4);
+ OS.EmitIntValue(FieldRelocLen, 4);
// Emit func_info table.
OS.AddComment("FuncInfo");
@@ -853,35 +824,20 @@ void BTFDebug::emitBTFExtSection() {
}
}
- // Emit offset reloc table.
- if (OffsetRelocLen) {
- OS.AddComment("OffsetReloc");
- OS.EmitIntValue(BTF::BPFOffsetRelocSize, 4);
- for (const auto &OffsetRelocSec : OffsetRelocTable) {
- OS.AddComment("Offset reloc section string offset=" +
- std::to_string(OffsetRelocSec.first));
- OS.EmitIntValue(OffsetRelocSec.first, 4);
- OS.EmitIntValue(OffsetRelocSec.second.size(), 4);
- for (const auto &OffsetRelocInfo : OffsetRelocSec.second) {
- Asm->EmitLabelReference(OffsetRelocInfo.Label, 4);
- OS.EmitIntValue(OffsetRelocInfo.TypeID, 4);
- OS.EmitIntValue(OffsetRelocInfo.OffsetNameOff, 4);
- }
- }
- }
-
- // Emit extern reloc table.
- if (ExternRelocLen) {
- OS.AddComment("ExternReloc");
- OS.EmitIntValue(BTF::BPFExternRelocSize, 4);
- for (const auto &ExternRelocSec : ExternRelocTable) {
- OS.AddComment("Extern reloc section string offset=" +
- std::to_string(ExternRelocSec.first));
- OS.EmitIntValue(ExternRelocSec.first, 4);
- OS.EmitIntValue(ExternRelocSec.second.size(), 4);
- for (const auto &ExternRelocInfo : ExternRelocSec.second) {
- Asm->EmitLabelReference(ExternRelocInfo.Label, 4);
- OS.EmitIntValue(ExternRelocInfo.ExternNameOff, 4);
+ // Emit field reloc table.
+ if (FieldRelocLen) {
+ OS.AddComment("FieldReloc");
+ OS.EmitIntValue(BTF::BPFFieldRelocSize, 4);
+ for (const auto &FieldRelocSec : FieldRelocTable) {
+ OS.AddComment("Field reloc section string offset=" +
+ std::to_string(FieldRelocSec.first));
+ OS.EmitIntValue(FieldRelocSec.first, 4);
+ OS.EmitIntValue(FieldRelocSec.second.size(), 4);
+ for (const auto &FieldRelocInfo : FieldRelocSec.second) {
+ Asm->EmitLabelReference(FieldRelocInfo.Label, 4);
+ OS.EmitIntValue(FieldRelocInfo.TypeID, 4);
+ OS.EmitIntValue(FieldRelocInfo.OffsetNameOff, 4);
+ OS.EmitIntValue(FieldRelocInfo.RelocKind, 4);
}
}
}
@@ -942,7 +898,7 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
// Construct subprogram func type
auto FuncTypeEntry =
- llvm::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId);
+ std::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId);
uint32_t FuncTypeId = addType(std::move(FuncTypeEntry));
for (const auto &TypeEntry : TypeEntries)
@@ -980,71 +936,27 @@ unsigned BTFDebug::populateStructType(const DIType *Ty) {
return Id;
}
-// Find struct/array debuginfo types given a type id.
-void BTFDebug::setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType,
- BTFTypeArray **PrevArrayType) {
- for (const auto &StructType : StructTypes) {
- if (StructType->getId() == TypeId) {
- *PrevStructType = StructType;
- return;
- }
- }
- for (const auto &ArrayType : ArrayTypes) {
- if (ArrayType->getId() == TypeId) {
- *PrevArrayType = ArrayType;
- return;
- }
- }
-}
-
-/// Generate a struct member offset relocation.
-void BTFDebug::generateOffsetReloc(const MachineInstr *MI,
+/// Generate a struct member field relocation.
+void BTFDebug::generateFieldReloc(const MachineInstr *MI,
const MCSymbol *ORSym, DIType *RootTy,
StringRef AccessPattern) {
- BTFTypeStruct *PrevStructType = nullptr;
- BTFTypeArray *PrevArrayType = nullptr;
unsigned RootId = populateStructType(RootTy);
- setTypeFromId(RootId, &PrevStructType, &PrevArrayType);
- unsigned RootTySize = PrevStructType->getStructSize();
-
- BTFOffsetReloc OffsetReloc;
- OffsetReloc.Label = ORSym;
- OffsetReloc.OffsetNameOff = addString(AccessPattern.drop_back());
- OffsetReloc.TypeID = RootId;
-
- uint32_t Start = 0, End = 0, Offset = 0;
- bool FirstAccess = true;
- for (auto C : AccessPattern) {
- if (C != ':') {
- End++;
- } else {
- std::string SubStr = AccessPattern.substr(Start, End - Start);
- int Loc = std::stoi(SubStr);
-
- if (FirstAccess) {
- Offset = Loc * RootTySize;
- FirstAccess = false;
- } else if (PrevStructType) {
- uint32_t MemberOffset, MemberTypeId;
- PrevStructType->getMemberInfo(Loc, MemberOffset, MemberTypeId);
-
- Offset += MemberOffset >> 3;
- PrevStructType = nullptr;
- setTypeFromId(MemberTypeId, &PrevStructType, &PrevArrayType);
- } else if (PrevArrayType) {
- uint32_t LocOffset, ElementTypeId;
- PrevArrayType->getLocInfo(Loc, LocOffset, ElementTypeId);
-
- Offset += LocOffset;
- PrevArrayType = nullptr;
- setTypeFromId(ElementTypeId, &PrevStructType, &PrevArrayType);
- }
- Start = End + 1;
- End = Start;
- }
- }
- AccessOffsets[RootTy->getName().str() + ":" + AccessPattern.str()] = Offset;
- OffsetRelocTable[SecNameOff].push_back(OffsetReloc);
+ size_t FirstDollar = AccessPattern.find_first_of('$');
+ size_t FirstColon = AccessPattern.find_first_of(':');
+ size_t SecondColon = AccessPattern.find_first_of(':', FirstColon + 1);
+ StringRef IndexPattern = AccessPattern.substr(FirstDollar + 1);
+ StringRef RelocKindStr = AccessPattern.substr(FirstColon + 1,
+ SecondColon - FirstColon);
+ StringRef PatchImmStr = AccessPattern.substr(SecondColon + 1,
+ FirstDollar - SecondColon);
+
+ BTFFieldReloc FieldReloc;
+ FieldReloc.Label = ORSym;
+ FieldReloc.OffsetNameOff = addString(IndexPattern);
+ FieldReloc.TypeID = RootId;
+ FieldReloc.RelocKind = std::stoull(RelocKindStr);
+ PatchImms[AccessPattern.str()] = std::stoul(PatchImmStr);
+ FieldRelocTable[SecNameOff].push_back(FieldReloc);
}
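generateFieldReloc above splits the access-pattern string on its first two ':' delimiters and on '$': everything after '$' is the index pattern, while the two middle components are parsed as the relocation kind and the patch immediate (the substring lengths still include the trailing delimiter, which std::stoull/std::stoul simply stop at). A standalone sketch with std::string in place of StringRef; the input string is made up for illustration and does not claim to show the exact prefix format the AMA pass emits:

    #include <cassert>
    #include <string>

    int main() {
      std::string AccessPattern = "s:0:16$0:2:1";           // hypothetical input
      size_t FirstDollar = AccessPattern.find_first_of('$');
      size_t FirstColon  = AccessPattern.find_first_of(':');
      size_t SecondColon = AccessPattern.find_first_of(':', FirstColon + 1);
      std::string IndexPattern = AccessPattern.substr(FirstDollar + 1);
      std::string RelocKindStr = AccessPattern.substr(FirstColon + 1,
                                                      SecondColon - FirstColon);
      std::string PatchImmStr  = AccessPattern.substr(SecondColon + 1,
                                                      FirstDollar - SecondColon);
      assert(std::stoull(RelocKindStr) == 0);   // "0:" -> 0, parsing stops at ':'
      assert(std::stoul(PatchImmStr) == 16);    // "16$" -> 16, parsing stops at '$'
      assert(IndexPattern == "0:2:1");
      return 0;
    }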
void BTFDebug::processLDimm64(const MachineInstr *MI) {
@@ -1052,7 +964,7 @@ void BTFDebug::processLDimm64(const MachineInstr *MI) {
// will generate an .BTF.ext record.
//
// If the insn is "r2 = LD_imm64 @__BTF_...",
- // add this insn into the .BTF.ext OffsetReloc subsection.
+ // add this insn into the .BTF.ext FieldReloc subsection.
// Relocation looks like:
// . SecName:
// . InstOffset
@@ -1083,16 +995,7 @@ void BTFDebug::processLDimm64(const MachineInstr *MI) {
MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
DIType *Ty = dyn_cast<DIType>(MDN);
- generateOffsetReloc(MI, ORSym, Ty, GVar->getName());
- } else if (GVar && !GVar->hasInitializer() && GVar->hasExternalLinkage() &&
- GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) {
- MCSymbol *ORSym = OS.getContext().createTempSymbol();
- OS.EmitLabel(ORSym);
-
- BTFExternReloc ExternReloc;
- ExternReloc.Label = ORSym;
- ExternReloc.ExternNameOff = addString(GVar->getName());
- ExternRelocTable[SecNameOff].push_back(ExternReloc);
+ generateFieldReloc(MI, ORSym, Ty, GVar->getName());
}
}
}
@@ -1200,12 +1103,12 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
? BTF::VAR_GLOBAL_ALLOCATED
: BTF::VAR_STATIC;
auto VarEntry =
- llvm::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo);
+ std::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo);
uint32_t VarId = addType(std::move(VarEntry));
// Find or create a DataSec
if (DataSecEntries.find(SecName) == DataSecEntries.end()) {
- DataSecEntries[SecName] = llvm::make_unique<BTFKindDataSec>(Asm, SecName);
+ DataSecEntries[SecName] = std::make_unique<BTFKindDataSec>(Asm, SecName);
}
// Calculate symbol size
@@ -1224,30 +1127,12 @@ bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
- MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
- DIType *Ty = dyn_cast<DIType>(MDN);
- std::string TypeName = Ty->getName();
- int64_t Imm = AccessOffsets[TypeName + ":" + GVar->getName().str()];
-
- // Emit "mov ri, <imm>" for abstract member accesses.
+ // Emit "mov ri, <imm>" for patched immediate.
+ uint32_t Imm = PatchImms[GVar->getName().str()];
OutMI.setOpcode(BPF::MOV_ri);
OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
OutMI.addOperand(MCOperand::createImm(Imm));
return true;
- } else if (GVar && !GVar->hasInitializer() &&
- GVar->hasExternalLinkage() &&
- GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) {
- const IntegerType *IntTy = dyn_cast<IntegerType>(GVar->getValueType());
- assert(IntTy);
- // For patchable externals, emit "LD_imm64, ri, 0" if the external
- // variable is 64bit width, emit "mov ri, 0" otherwise.
- if (IntTy->getBitWidth() == 64)
- OutMI.setOpcode(BPF::LD_imm64);
- else
- OutMI.setOpcode(BPF::MOV_ri);
- OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
- OutMI.addOperand(MCOperand::createImm(0));
- return true;
}
}
}
@@ -1281,7 +1166,7 @@ void BTFDebug::endModule() {
}
if (StructTypeId == 0) {
- auto FwdTypeEntry = llvm::make_unique<BTFTypeFwd>(TypeName, IsUnion);
+ auto FwdTypeEntry = std::make_unique<BTFTypeFwd>(TypeName, IsUnion);
StructTypeId = addType(std::move(FwdTypeEntry));
}
diff --git a/lib/Target/BPF/BTFDebug.h b/lib/Target/BPF/BTFDebug.h
index 6c0cdde17d9b..c01e0d1d1612 100644
--- a/lib/Target/BPF/BTFDebug.h
+++ b/lib/Target/BPF/BTFDebug.h
@@ -104,15 +104,13 @@ public:
/// Handle array type.
class BTFTypeArray : public BTFTypeBase {
- uint32_t ElemSize;
struct BTF::BTFArray ArrayInfo;
public:
- BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, uint32_t NumElems);
+ BTFTypeArray(uint32_t ElemTypeId, uint32_t NumElems);
uint32_t getSize() { return BTFTypeBase::getSize() + BTF::BTFArraySize; }
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
- void getLocInfo(uint32_t Loc, uint32_t &LocOffset, uint32_t &ElementTypeId);
};
/// Handle struct/union type.
@@ -130,8 +128,6 @@ public:
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
std::string getName();
- void getMemberInfo(uint32_t Loc, uint32_t &Offset, uint32_t &MemberType);
- uint32_t getStructSize();
};
/// Handle function pointer.
@@ -199,7 +195,7 @@ class BTFStringTable {
/// A mapping from string table offset to the index
/// of the Table. It is used to avoid putting
/// duplicated strings in the table.
- std::unordered_map<uint32_t, uint32_t> OffsetToIdMap;
+ std::map<uint32_t, uint32_t> OffsetToIdMap;
/// A vector of strings to represent the string table.
std::vector<std::string> Table;
@@ -228,16 +224,11 @@ struct BTFLineInfo {
};
/// Represent one offset relocation.
-struct BTFOffsetReloc {
+struct BTFFieldReloc {
const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc
uint32_t TypeID; ///< Type ID
uint32_t OffsetNameOff; ///< The string to traverse types
-};
-
-/// Represent one extern relocation.
-struct BTFExternReloc {
- const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc
- uint32_t ExternNameOff; ///< The extern variable name
+ uint32_t RelocKind; ///< How to patch the instruction
};
/// Collect and emit BTF information.
@@ -253,13 +244,11 @@ class BTFDebug : public DebugHandlerBase {
std::unordered_map<const DIType *, uint32_t> DIToIdMap;
std::map<uint32_t, std::vector<BTFFuncInfo>> FuncInfoTable;
std::map<uint32_t, std::vector<BTFLineInfo>> LineInfoTable;
- std::map<uint32_t, std::vector<BTFOffsetReloc>> OffsetRelocTable;
- std::map<uint32_t, std::vector<BTFExternReloc>> ExternRelocTable;
+ std::map<uint32_t, std::vector<BTFFieldReloc>> FieldRelocTable;
StringMap<std::vector<std::string>> FileContent;
std::map<std::string, std::unique_ptr<BTFKindDataSec>> DataSecEntries;
std::vector<BTFTypeStruct *> StructTypes;
- std::vector<BTFTypeArray *> ArrayTypes;
- std::map<std::string, int64_t> AccessOffsets;
+ std::map<std::string, uint32_t> PatchImms;
std::map<StringRef, std::pair<bool, std::vector<BTFTypeDerived *>>>
FixupDerivedTypes;
@@ -305,13 +294,9 @@ class BTFDebug : public DebugHandlerBase {
void processGlobals(bool ProcessingMapDef);
/// Generate one offset relocation record.
- void generateOffsetReloc(const MachineInstr *MI, const MCSymbol *ORSym,
+ void generateFieldReloc(const MachineInstr *MI, const MCSymbol *ORSym,
DIType *RootTy, StringRef AccessPattern);
- /// Set the to-be-traversed Struct/Array Type based on TypeId.
- void setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType,
- BTFTypeArray **PrevArrayType);
-
/// Populating unprocessed struct type.
unsigned populateStructType(const DIType *Ty);
diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 057bbf5c3b06..ef4e324c3bdd 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -39,7 +39,7 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
// determine the type of the relocation
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getKind()) {
default:
llvm_unreachable("invalid fixup kind!");
case FK_SecRel_8:
@@ -85,5 +85,5 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
std::unique_ptr<MCObjectTargetWriter>
llvm::createBPFELFObjectWriter(uint8_t OSABI) {
- return llvm::make_unique<BPFELFObjectWriter>(OSABI);
+ return std::make_unique<BPFELFObjectWriter>(OSABI);
}
diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index 0881bf841f90..590c4a2eb69d 100644
--- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -702,7 +702,7 @@ bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) {
// Make sure we have a number (false is returned if expression is a number)
if (!getParser().parseExpression(Value)) {
// Make sure this is a number that is in range
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+ auto *MCE = cast<MCConstantExpr>(Value);
uint64_t IntValue = MCE->getValue();
if (!isUIntN(Size, IntValue) && !isIntN(Size, IntValue))
return Error(ExprLoc, "literal value out of range (256) for falign");
diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp
index b7e95caf24fb..efd5ed915127 100644
--- a/lib/Target/Hexagon/BitTracker.cpp
+++ b/lib/Target/Hexagon/BitTracker.cpp
@@ -84,7 +84,7 @@ namespace {
raw_ostream &operator<< (raw_ostream &OS, const printv &PV) {
if (PV.R)
- OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R);
+ OS << 'v' << Register::virtReg2Index(PV.R);
else
OS << 's';
return OS;
@@ -201,7 +201,7 @@ BitTracker::~BitTracker() {
bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) {
// An example when "meet" can be invoked with SelfR == 0 is a phi node
// with a physical register as an operand.
- assert(SelfR == 0 || TargetRegisterInfo::isVirtualRegister(SelfR));
+ assert(SelfR == 0 || Register::isVirtualRegister(SelfR));
bool Changed = false;
for (uint16_t i = 0, n = Bits.size(); i < n; ++i) {
const BitValue &RCV = RC[i];
@@ -335,12 +335,13 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
// 1. find a physical register PhysR from the same class as RR.Reg,
// 2. find a physical register PhysS that corresponds to PhysR:RR.Sub,
// 3. find a register class that contains PhysS.
- if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) {
+ if (Register::isVirtualRegister(RR.Reg)) {
const auto &VC = composeWithSubRegIndex(*MRI.getRegClass(RR.Reg), RR.Sub);
return TRI.getRegSizeInBits(VC);
}
- assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg));
- unsigned PhysR = (RR.Sub == 0) ? RR.Reg : TRI.getSubReg(RR.Reg, RR.Sub);
+ assert(Register::isPhysicalRegister(RR.Reg));
+ Register PhysR =
+ (RR.Sub == 0) ? Register(RR.Reg) : TRI.getSubReg(RR.Reg, RR.Sub);
return getPhysRegBitWidth(PhysR);
}
@@ -350,10 +351,10 @@ BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR,
// Physical registers are assumed to be present in the map with an unknown
// value. Don't actually insert anything in the map, just return the cell.
- if (TargetRegisterInfo::isPhysicalRegister(RR.Reg))
+ if (Register::isPhysicalRegister(RR.Reg))
return RegisterCell::self(0, BW);
- assert(TargetRegisterInfo::isVirtualRegister(RR.Reg));
+ assert(Register::isVirtualRegister(RR.Reg));
// For virtual registers that belong to a class that is not tracked,
// generate an "unknown" value as well.
const TargetRegisterClass *C = MRI.getRegClass(RR.Reg);
@@ -376,7 +377,7 @@ void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
// While updating the cell map can be done in a meaningful way for
// a part of a register, it makes little sense to implement it as the
// SSA representation would never contain such "partial definitions".
- if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+ if (!Register::isVirtualRegister(RR.Reg))
return;
assert(RR.Sub == 0 && "Unexpected sub-register in definition");
// Eliminate all ref-to-reg-0 bit values: replace them with "self".
@@ -711,7 +712,7 @@ BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
}
uint16_t BT::MachineEvaluator::getPhysRegBitWidth(unsigned Reg) const {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Register::isPhysicalRegister(Reg));
const TargetRegisterClass &PC = *TRI.getMinimalPhysRegClass(Reg);
return TRI.getRegSizeInBits(PC);
}
@@ -874,7 +875,7 @@ void BT::visitNonBranch(const MachineInstr &MI) {
continue;
RegisterRef RD(MO);
assert(RD.Sub == 0 && "Unexpected sub-register in definition");
- if (!TargetRegisterInfo::isVirtualRegister(RD.Reg))
+ if (!Register::isVirtualRegister(RD.Reg))
continue;
bool Changed = false;
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index b07d15609ede..3d771d388e28 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -130,7 +130,7 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!MO.isReg())
return true;
- unsigned RegNumber = MO.getReg();
+ Register RegNumber = MO.getReg();
// This should be an assert in the frontend.
if (Hexagon::DoubleRegsRegClass.contains(RegNumber))
RegNumber = TRI->getSubReg(RegNumber, ExtraCode[0] == 'L' ?
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 7b75d251ccd3..3068fb6f9629 100644
--- a/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -147,11 +147,11 @@ namespace {
}
static inline unsigned v2x(unsigned v) {
- return TargetRegisterInfo::virtReg2Index(v);
+ return Register::virtReg2Index(v);
}
static inline unsigned x2v(unsigned x) {
- return TargetRegisterInfo::index2VirtReg(x);
+ return Register::index2VirtReg(x);
}
};
@@ -290,8 +290,8 @@ void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI,
for (auto &Op : MI.operands()) {
if (!Op.isReg() || !Op.isDef())
continue;
- unsigned R = Op.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = Op.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
Defs.insert(R);
}
@@ -302,8 +302,8 @@ void HexagonBitSimplify::getInstrUses(const MachineInstr &MI,
for (auto &Op : MI.operands()) {
if (!Op.isReg() || !Op.isUse())
continue;
- unsigned R = Op.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = Op.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
Uses.insert(R);
}
@@ -353,8 +353,7 @@ bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC,
bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR,
MachineRegisterInfo &MRI) {
- if (!TargetRegisterInfo::isVirtualRegister(OldR) ||
- !TargetRegisterInfo::isVirtualRegister(NewR))
+ if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR))
return false;
auto Begin = MRI.use_begin(OldR), End = MRI.use_end();
decltype(End) NextI;
@@ -367,8 +366,7 @@ bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR,
bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR,
unsigned NewSR, MachineRegisterInfo &MRI) {
- if (!TargetRegisterInfo::isVirtualRegister(OldR) ||
- !TargetRegisterInfo::isVirtualRegister(NewR))
+ if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR))
return false;
if (hasTiedUse(OldR, MRI, NewSR))
return false;
@@ -384,8 +382,7 @@ bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR,
bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR,
unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) {
- if (!TargetRegisterInfo::isVirtualRegister(OldR) ||
- !TargetRegisterInfo::isVirtualRegister(NewR))
+ if (!Register::isVirtualRegister(OldR) || !Register::isVirtualRegister(NewR))
return false;
if (OldSR != NewSR && hasTiedUse(OldR, MRI, NewSR))
return false;
@@ -896,7 +893,7 @@ bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN,
// register class.
const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) {
- if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+ if (!Register::isVirtualRegister(RR.Reg))
return nullptr;
auto *RC = MRI.getRegClass(RR.Reg);
if (RR.Sub == 0)
@@ -927,8 +924,8 @@ const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
// with a 32-bit register.
bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD,
const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) {
- if (!TargetRegisterInfo::isVirtualRegister(RD.Reg) ||
- !TargetRegisterInfo::isVirtualRegister(RS.Reg))
+ if (!Register::isVirtualRegister(RD.Reg) ||
+ !Register::isVirtualRegister(RS.Reg))
return false;
// Return false if one (or both) classes are nullptr.
auto *DRC = getFinalVRegClass(RD, MRI);
@@ -979,7 +976,7 @@ bool DeadCodeElimination::isDead(unsigned R) const {
continue;
if (UseI->isPHI()) {
assert(!UseI->getOperand(0).getSubReg());
- unsigned DR = UseI->getOperand(0).getReg();
+ Register DR = UseI->getOperand(0).getReg();
if (DR == R)
continue;
}
@@ -1018,8 +1015,8 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
for (auto &Op : MI->operands()) {
if (!Op.isReg() || !Op.isDef())
continue;
- unsigned R = Op.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R) || !isDead(R)) {
+ Register R = Op.getReg();
+ if (!Register::isVirtualRegister(R) || !isDead(R)) {
AllDead = false;
break;
}
@@ -1220,8 +1217,8 @@ bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) {
return false;
MachineInstr &UseI = *I->getParent();
if (UseI.isPHI() || UseI.isCopy()) {
- unsigned DefR = UseI.getOperand(0).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DefR))
+ Register DefR = UseI.getOperand(0).getReg();
+ if (!Register::isVirtualRegister(DefR))
return false;
Pending.push_back(DefR);
} else {
@@ -1345,7 +1342,7 @@ bool RedundantInstrElimination::processBlock(MachineBasicBlock &B,
// If found, replace the instruction with a COPY.
const DebugLoc &DL = MI->getDebugLoc();
const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
- unsigned NewR = MRI.createVirtualRegister(FRC);
+ Register NewR = MRI.createVirtualRegister(FRC);
MachineInstr *CopyI =
BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
.addReg(RS.Reg, 0, RS.Sub);
@@ -1412,7 +1409,7 @@ bool ConstGeneration::isTfrConst(const MachineInstr &MI) {
// register class and the actual value being transferred.
unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C,
MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) {
- unsigned Reg = MRI.createVirtualRegister(RC);
+ Register Reg = MRI.createVirtualRegister(RC);
if (RC == &Hexagon::IntRegsRegClass) {
BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg)
.addImm(int32_t(C));
@@ -1470,7 +1467,7 @@ bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) {
if (Defs.count() != 1)
continue;
unsigned DR = Defs.find_first();
- if (!TargetRegisterInfo::isVirtualRegister(DR))
+ if (!Register::isVirtualRegister(DR))
continue;
uint64_t U;
const BitTracker::RegisterCell &DRC = BT.lookup(DR);
@@ -1609,7 +1606,7 @@ bool CopyGeneration::processBlock(MachineBasicBlock &B,
auto *FRC = HBS::getFinalVRegClass(R, MRI);
if (findMatch(R, MR, AVB)) {
- unsigned NewR = MRI.createVirtualRegister(FRC);
+ Register NewR = MRI.createVirtualRegister(FRC);
BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
.addReg(MR.Reg, 0, MR.Sub);
BT.put(BitTracker::RegisterRef(NewR), BT.get(MR));
@@ -1628,7 +1625,7 @@ bool CopyGeneration::processBlock(MachineBasicBlock &B,
BitTracker::RegisterRef ML, MH;
if (findMatch(TL, ML, AVB) && findMatch(TH, MH, AVB)) {
auto *FRC = HBS::getFinalVRegClass(R, MRI);
- unsigned NewR = MRI.createVirtualRegister(FRC);
+ Register NewR = MRI.createVirtualRegister(FRC);
BuildMI(B, At, DL, HII.get(TargetOpcode::REG_SEQUENCE), NewR)
.addReg(ML.Reg, 0, ML.Sub)
.addImm(SubLo)
@@ -1819,7 +1816,7 @@ bool BitSimplification::matchHalf(unsigned SelfR,
if (Reg == 0 || Reg == SelfR) // Don't match "self".
return false;
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return false;
if (!BT.has(Reg))
return false;
@@ -2025,7 +2022,7 @@ bool BitSimplification::genPackhl(MachineInstr *MI,
return false;
MachineBasicBlock &B = *MI->getParent();
- unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+ Register NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
DebugLoc DL = MI->getDebugLoc();
auto At = MI->isPHI() ? B.getFirstNonPHI()
: MachineBasicBlock::iterator(MI);
@@ -2097,7 +2094,7 @@ bool BitSimplification::genCombineHalf(MachineInstr *MI,
MachineBasicBlock &B = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
auto At = MI->isPHI() ? B.getFirstNonPHI()
: MachineBasicBlock::iterator(MI);
BuildMI(B, At, DL, HII.get(COpc), NewR)
@@ -2154,7 +2151,7 @@ bool BitSimplification::genExtractLow(MachineInstr *MI,
if (!validateReg(RS, NewOpc, 1))
continue;
- unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
auto At = MI->isPHI() ? B.getFirstNonPHI()
: MachineBasicBlock::iterator(MI);
auto MIB = BuildMI(B, At, DL, HII.get(NewOpc), NewR)
@@ -2368,7 +2365,7 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI,
return true;
}
} else if (V.is(0) || V.is(1)) {
- unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ Register NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
unsigned NewOpc = V.is(0) ? Hexagon::PS_false : Hexagon::PS_true;
BuildMI(B, At, DL, HII.get(NewOpc), NewR);
HBS::replaceReg(RD.Reg, NewR, MRI);
@@ -2541,7 +2538,7 @@ bool BitSimplification::simplifyExtractLow(MachineInstr *MI,
DebugLoc DL = MI->getDebugLoc();
MachineBasicBlock &B = *MI->getParent();
- unsigned NewR = MRI.createVirtualRegister(FRC);
+ Register NewR = MRI.createVirtualRegister(FRC);
auto At = MI->isPHI() ? B.getFirstNonPHI()
: MachineBasicBlock::iterator(MI);
auto MIB = BuildMI(B, At, DL, HII.get(ExtOpc), NewR)
@@ -2612,8 +2609,8 @@ bool BitSimplification::simplifyRCmp0(MachineInstr *MI,
KnownNZ = true;
}
- auto ReplaceWithConst = [&] (int C) {
- unsigned NewR = MRI.createVirtualRegister(FRC);
+ auto ReplaceWithConst = [&](int C) {
+ Register NewR = MRI.createVirtualRegister(FRC);
BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), NewR)
.addImm(C);
HBS::replaceReg(RD.Reg, NewR, MRI);
@@ -2678,7 +2675,7 @@ bool BitSimplification::simplifyRCmp0(MachineInstr *MI,
// replace the comparison with a C2_muxii, using the same predicate
// register, but with operands substituted with 0/1 accordingly.
if ((KnownZ1 || KnownNZ1) && (KnownZ2 || KnownNZ2)) {
- unsigned NewR = MRI.createVirtualRegister(FRC);
+ Register NewR = MRI.createVirtualRegister(FRC);
BuildMI(B, At, DL, HII.get(Hexagon::C2_muxii), NewR)
.addReg(InpDef->getOperand(1).getReg())
.addImm(KnownZ1 == (Opc == Hexagon::A4_rcmpeqi))
@@ -3071,7 +3068,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB,
DenseMap<unsigned,unsigned> RegMap;
const TargetRegisterClass *PhiRC = MRI->getRegClass(NewPredR);
- unsigned PhiR = MRI->createVirtualRegister(PhiRC);
+ Register PhiR = MRI->createVirtualRegister(PhiRC);
BuildMI(LB, At, At->getDebugLoc(), HII->get(TargetOpcode::PHI), PhiR)
.addReg(NewPredR)
.addMBB(&PB)
@@ -3083,7 +3080,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB,
const MachineInstr *SI = G.Ins[i-1];
unsigned DR = getDefReg(SI);
const TargetRegisterClass *RC = MRI->getRegClass(DR);
- unsigned NewDR = MRI->createVirtualRegister(RC);
+ Register NewDR = MRI->createVirtualRegister(RC);
DebugLoc DL = SI->getDebugLoc();
auto MIB = BuildMI(LB, At, DL, HII->get(SI->getOpcode()), NewDR);
@@ -3162,7 +3159,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
if (Defs.count() != 1)
continue;
unsigned DefR = Defs.find_first();
- if (!TargetRegisterInfo::isVirtualRegister(DefR))
+ if (!Register::isVirtualRegister(DefR))
continue;
if (!isBitShuffle(&*I, DefR))
continue;
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
index ba50faac2cf9..ebd060ce503e 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -111,7 +111,7 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
}
uint16_t HexagonEvaluator::getPhysRegBitWidth(unsigned Reg) const {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Register::isPhysicalRegister(Reg));
using namespace Hexagon;
const auto &HST = MF.getSubtarget<HexagonSubtarget>();
@@ -1042,8 +1042,8 @@ unsigned HexagonEvaluator::getUniqueDefVReg(const MachineInstr &MI) const {
for (const MachineOperand &Op : MI.operands()) {
if (!Op.isReg() || !Op.isDef())
continue;
- unsigned R = Op.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = Op.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
if (DefReg != 0)
return 0;
@@ -1220,7 +1220,7 @@ bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr &MI,
RegisterRef RD = MI.getOperand(0);
RegisterRef RS = MI.getOperand(1);
assert(RD.Sub == 0);
- if (!TargetRegisterInfo::isPhysicalRegister(RS.Reg))
+ if (!Register::isPhysicalRegister(RS.Reg))
return false;
RegExtMap::const_iterator F = VRX.find(RD.Reg);
if (F == VRX.end())
diff --git a/lib/Target/Hexagon/HexagonBlockRanges.cpp b/lib/Target/Hexagon/HexagonBlockRanges.cpp
index 999150fc8c6e..d1d1b8ee7d41 100644
--- a/lib/Target/Hexagon/HexagonBlockRanges.cpp
+++ b/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -268,14 +268,14 @@ HexagonBlockRanges::RegisterSet HexagonBlockRanges::expandToSubRegs(
return SRs;
}
- if (TargetRegisterInfo::isPhysicalRegister(R.Reg)) {
+ if (Register::isPhysicalRegister(R.Reg)) {
MCSubRegIterator I(R.Reg, &TRI);
if (!I.isValid())
SRs.insert({R.Reg, 0});
for (; I.isValid(); ++I)
SRs.insert({*I, 0});
} else {
- assert(TargetRegisterInfo::isVirtualRegister(R.Reg));
+ assert(Register::isVirtualRegister(R.Reg));
auto &RC = *MRI.getRegClass(R.Reg);
unsigned PReg = *RC.begin();
MCSubRegIndexIterator I(PReg, &TRI);
@@ -321,7 +321,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
if (!Op.isReg() || !Op.isUse() || Op.isUndef())
continue;
RegisterRef R = { Op.getReg(), Op.getSubReg() };
- if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
+ if (Register::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
continue;
bool IsKill = Op.isKill();
for (auto S : expandToSubRegs(R, MRI, TRI)) {
@@ -338,7 +338,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
continue;
RegisterRef R = { Op.getReg(), Op.getSubReg() };
for (auto S : expandToSubRegs(R, MRI, TRI)) {
- if (TargetRegisterInfo::isPhysicalRegister(S.Reg) && Reserved[S.Reg])
+ if (Register::isPhysicalRegister(S.Reg) && Reserved[S.Reg])
continue;
if (Op.isDead())
Clobbers.insert(S);
@@ -374,7 +374,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
// Update maps for defs.
for (RegisterRef S : Defs) {
// Defs should already be expanded into subregs.
- assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) ||
+ assert(!Register::isPhysicalRegister(S.Reg) ||
!MCSubRegIterator(S.Reg, &TRI, false).isValid());
if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
closeRange(S);
@@ -383,7 +383,7 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
// Update maps for clobbers.
for (RegisterRef S : Clobbers) {
// Clobbers should already be expanded into subregs.
- assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) ||
+ assert(!Register::isPhysicalRegister(S.Reg) ||
!MCSubRegIterator(S.Reg, &TRI, false).isValid());
if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
closeRange(S);
@@ -482,7 +482,7 @@ HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeDeadMap(
}
}
for (auto &P : LiveMap)
- if (TargetRegisterInfo::isVirtualRegister(P.first.Reg))
+ if (Register::isVirtualRegister(P.first.Reg))
addDeadRanges(P.first);
LLVM_DEBUG(dbgs() << __func__ << ": dead map\n"
diff --git a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
index ee93739b2c7b..08f740806879 100644
--- a/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
+++ b/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
@@ -105,12 +105,11 @@ void HexagonBranchRelaxation::computeOffset(MachineFunction &MF,
// offset of the current instruction from the start.
unsigned InstOffset = 0;
for (auto &B : MF) {
- if (B.getAlignment()) {
+ if (B.getAlignment() != Align::None()) {
// Although we don't know the exact layout of the final code, we need
// to account for alignment padding somehow. This heuristic pads each
// aligned basic block according to the alignment value.
- int ByteAlign = (1u << B.getAlignment()) - 1;
- InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign);
+ InstOffset = alignTo(InstOffset, B.getAlignment());
}
OffsetMap[&B] = InstOffset;
for (auto &MI : B.instrs()) {
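
The padding heuristic described in the comment above is unchanged by this hunk; only its expression moves from a hand-rolled log2 mask to the Align-based alignTo helper. A small standalone sketch of the two forms, assuming a power-of-two block alignment (the concrete value 16 below is only an example):

    #include <cassert>

    // Old form: MachineBasicBlock::getAlignment() returned a log2 value, so
    // the padding was computed with a mask.
    static unsigned padWithMask(unsigned InstOffset, unsigned Log2Align) {
      unsigned ByteAlign = (1u << Log2Align) - 1;
      return (InstOffset + ByteAlign) & ~ByteAlign;
    }

    // New form: getAlignment() returns an Align (a byte count) and alignTo()
    // rounds up to the next multiple; reproduced here as plain arithmetic.
    static unsigned padWithAlignTo(unsigned InstOffset, unsigned ByteAlign) {
      return (InstOffset + ByteAlign - 1) / ByteAlign * ByteAlign;
    }

    // Example: an offset of 10 in a block aligned to 16 bytes pads to 16
    // under either computation.
    static void alignmentSketch() {
      assert(padWithMask(10, 4) == 16);
      assert(padWithAlignTo(10, 16) == 16);
    }
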
diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp
index cfed0ecef272..ddc9b847ef1c 100644
--- a/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -14,9 +14,10 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Pass.h"
#include <map>
#include <set>
#include <utility>
@@ -235,24 +236,24 @@ namespace {
Reg = Op.getReg();
Sub = Op.getSubReg();
} else if (Op.isFI()) {
- Reg = TargetRegisterInfo::index2StackSlot(Op.getIndex());
+ Reg = llvm::Register::index2StackSlot(Op.getIndex());
}
return *this;
}
bool isVReg() const {
- return Reg != 0 && !TargetRegisterInfo::isStackSlot(Reg) &&
- TargetRegisterInfo::isVirtualRegister(Reg);
+ return Reg != 0 && !llvm::Register::isStackSlot(Reg) &&
+ llvm::Register::isVirtualRegister(Reg);
}
bool isSlot() const {
- return Reg != 0 && TargetRegisterInfo::isStackSlot(Reg);
+ return Reg != 0 && llvm::Register::isStackSlot(Reg);
}
operator MachineOperand() const {
if (isVReg())
return MachineOperand::CreateReg(Reg, /*Def*/false, /*Imp*/false,
/*Kill*/false, /*Dead*/false, /*Undef*/false,
/*EarlyClobber*/false, Sub);
- if (TargetRegisterInfo::isStackSlot(Reg)) {
- int FI = TargetRegisterInfo::stackSlot2Index(Reg);
+ if (llvm::Register::isStackSlot(Reg)) {
+ int FI = llvm::Register::stackSlot2Index(Reg);
return MachineOperand::CreateFI(FI);
}
llvm_unreachable("Cannot create MachineOperand");
@@ -1524,7 +1525,7 @@ void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs,
}
HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) {
- unsigned DefR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+ llvm::Register DefR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
MachineBasicBlock &MBB = *DefL.Block;
MachineBasicBlock::iterator At = DefL.At;
DebugLoc dl = DefL.Block->findDebugLoc(DefL.At);
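
The RegisterRef changes in this file lean on the stack-slot encoding that also moved from TargetRegisterInfo onto Register: a frame index is folded into the same unsigned space as register numbers and unpacked again when a MachineOperand is rebuilt. A brief standalone round-trip sketch, separate from the patch and using only the Register.h helpers already visible in the hunks above:

    #include "llvm/CodeGen/Register.h"

    // Encode a frame index into the shared register/stack-slot space and
    // recover it; this is the round trip the RegisterRef operator uses.
    static int stackSlotRoundTrip(int FrameIndex) {
      unsigned Encoded = llvm::Register::index2StackSlot(FrameIndex);
      if (llvm::Register::isStackSlot(Encoded))
        return llvm::Register::stackSlot2Index(Encoded);
      return FrameIndex; // not reached for values produced by index2StackSlot
    }
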
diff --git a/lib/Target/Hexagon/HexagonConstPropagation.cpp b/lib/Target/Hexagon/HexagonConstPropagation.cpp
index d1fde5da5fe8..a82501cabb9b 100644
--- a/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -208,14 +208,14 @@ namespace {
bool has(unsigned R) const {
// All non-virtual registers are considered "bottom".
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ if (!Register::isVirtualRegister(R))
return true;
MapType::const_iterator F = Map.find(R);
return F != Map.end();
}
const LatticeCell &get(unsigned R) const {
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ if (!Register::isVirtualRegister(R))
return Bottom;
MapType::const_iterator F = Map.find(R);
if (F != Map.end())
@@ -623,7 +623,7 @@ void MachineConstPropagator::visitPHI(const MachineInstr &PN) {
const MachineOperand &MD = PN.getOperand(0);
RegisterSubReg DefR(MD);
- assert(TargetRegisterInfo::isVirtualRegister(DefR.Reg));
+ assert(Register::isVirtualRegister(DefR.Reg));
bool Changed = false;
@@ -652,7 +652,7 @@ Bottomize:
RegisterSubReg UseR(SO);
// If the input is not a virtual register, we don't really know what
// value it holds.
- if (!TargetRegisterInfo::isVirtualRegister(UseR.Reg))
+ if (!Register::isVirtualRegister(UseR.Reg))
goto Bottomize;
// If there is no cell for an input register, it means top.
if (!Cells.has(UseR.Reg))
@@ -694,7 +694,7 @@ void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) {
continue;
RegisterSubReg DefR(MO);
// Only track virtual registers.
- if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg))
+ if (!Register::isVirtualRegister(DefR.Reg))
continue;
bool Changed = false;
// If the evaluation failed, set cells for all output registers to bottom.
@@ -1070,7 +1070,7 @@ bool MachineConstPropagator::run(MachineFunction &MF) {
bool MachineConstEvaluator::getCell(const RegisterSubReg &R, const CellMap &Inputs,
LatticeCell &RC) {
- if (!TargetRegisterInfo::isVirtualRegister(R.Reg))
+ if (!Register::isVirtualRegister(R.Reg))
return false;
const LatticeCell &L = Inputs.get(R.Reg);
if (!R.SubReg) {
@@ -1926,7 +1926,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
unsigned Opc = MI.getOpcode();
RegisterSubReg DefR(MD);
assert(!DefR.SubReg);
- if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg))
+ if (!Register::isVirtualRegister(DefR.Reg))
return false;
if (MI.isCopy()) {
@@ -2793,7 +2793,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
if (!MO.isReg() || !MO.isUse() || MO.isImplicit())
continue;
RegisterSubReg R(MO);
- if (!TargetRegisterInfo::isVirtualRegister(R.Reg))
+ if (!Register::isVirtualRegister(R.Reg))
continue;
HasUse = true;
// PHIs can legitimately have "top" cells after propagation.
@@ -2813,7 +2813,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isUse() || MO.isImplicit())
continue;
- unsigned R = MO.getReg();
+ Register R = MO.getReg();
dbgs() << printReg(R, &TRI) << ": " << Inputs.get(R) << "\n";
}
}
@@ -2831,8 +2831,8 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned R = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = MO.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
assert(!MO.getSubReg());
assert(Inputs.has(R));
@@ -2871,7 +2871,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
const MCInstrDesc *NewD = (Ps & P::Zero) ?
&HII.get(Hexagon::PS_false) :
&HII.get(Hexagon::PS_true);
- unsigned NewR = MRI->createVirtualRegister(PredRC);
+ Register NewR = MRI->createVirtualRegister(PredRC);
const MachineInstrBuilder &MIB = BuildMI(B, At, DL, *NewD, NewR);
(void)MIB;
#ifndef NDEBUG
@@ -2893,7 +2893,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
NewRC = &Hexagon::IntRegsRegClass;
else
NewRC = &Hexagon::DoubleRegsRegClass;
- unsigned NewR = MRI->createVirtualRegister(NewRC);
+ Register NewR = MRI->createVirtualRegister(NewRC);
const MachineInstr *NewMI;
if (W == 32) {
@@ -3009,7 +3009,7 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
if (V < 0)
V = -V;
const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
- unsigned NewR = MRI->createVirtualRegister(RC);
+ Register NewR = MRI->createVirtualRegister(RC);
const MachineOperand &Src1 = MI.getOperand(1);
NewMI = BuildMI(B, At, DL, D, NewR)
.addReg(Src1.getReg(), getRegState(Src1), Src1.getSubReg())
@@ -3111,8 +3111,8 @@ bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
void HexagonConstEvaluator::replaceAllRegUsesWith(unsigned FromReg,
unsigned ToReg) {
- assert(TargetRegisterInfo::isVirtualRegister(FromReg));
- assert(TargetRegisterInfo::isVirtualRegister(ToReg));
+ assert(Register::isVirtualRegister(FromReg));
+ assert(Register::isVirtualRegister(ToReg));
for (auto I = MRI->use_begin(FromReg), E = MRI->use_end(); I != E;) {
MachineOperand &O = *I;
++I;
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index a09ccab483cf..394a329ac447 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -133,8 +133,8 @@ static bool isCombinableInstType(MachineInstr &MI, const HexagonInstrInfo *TII,
const MachineOperand &Op1 = MI.getOperand(1);
assert(Op0.isReg() && Op1.isReg());
- unsigned DestReg = Op0.getReg();
- unsigned SrcReg = Op1.getReg();
+ Register DestReg = Op0.getReg();
+ Register SrcReg = Op1.getReg();
return Hexagon::IntRegsRegClass.contains(DestReg) &&
Hexagon::IntRegsRegClass.contains(SrcReg);
}
@@ -146,7 +146,7 @@ static bool isCombinableInstType(MachineInstr &MI, const HexagonInstrInfo *TII,
const MachineOperand &Op1 = MI.getOperand(1);
assert(Op0.isReg());
- unsigned DestReg = Op0.getReg();
+ Register DestReg = Op0.getReg();
// Ensure that TargetFlags are MO_NO_FLAG for a global. This is a
// workaround for an ABI bug that prevents GOT relocations on combine
// instructions
@@ -226,7 +226,7 @@ static bool areCombinableOperations(const TargetRegisterInfo *TRI,
}
static bool isEvenReg(unsigned Reg) {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Register::isPhysicalRegister(Reg));
if (Hexagon::IntRegsRegClass.contains(Reg))
return (Reg - Hexagon::R0) % 2 == 0;
if (Hexagon::HvxVRRegClass.contains(Reg))
@@ -265,7 +265,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
unsigned I1DestReg,
unsigned I2DestReg,
bool &DoInsertAtI1) {
- unsigned I2UseReg = UseReg(I2.getOperand(1));
+ Register I2UseReg = UseReg(I2.getOperand(1));
// It is not safe to move I1 and I2 into one combine if I2 has a true
// dependence on I1.
@@ -332,7 +332,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
// At O3 we got better results (dhrystone) by being more conservative here.
if (!ShouldCombineAggressively)
End = std::next(MachineBasicBlock::iterator(I2));
- unsigned I1UseReg = UseReg(I1.getOperand(1));
+ Register I1UseReg = UseReg(I1.getOperand(1));
// Track killed operands. If we move across an instruction that kills our
// operand, we need to update the kill information on the moved I1. It kills
// the operand now.
@@ -410,7 +410,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
continue;
// Look for the defining instruction.
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
MachineInstr *DefInst = LastDef[Reg];
if (!DefInst)
continue;
@@ -442,7 +442,7 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
if (Op.isReg()) {
if (!Op.isDef() || !Op.getReg())
continue;
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
LastDef[*SubRegs] = &MI;
@@ -528,7 +528,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
while (I2 != I1.getParent()->end() && I2->isDebugInstr())
++I2;
- unsigned I1DestReg = I1.getOperand(0).getReg();
+ Register I1DestReg = I1.getOperand(0).getReg();
for (MachineBasicBlock::iterator End = I1.getParent()->end(); I2 != End;
++I2) {
@@ -544,7 +544,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(&*I2))
continue;
- unsigned I2DestReg = I2->getOperand(0).getReg();
+ Register I2DestReg = I2->getOperand(0).getReg();
// Check that registers are adjacent and that the first destination register
// is even.
@@ -579,8 +579,8 @@ void HexagonCopyToCombine::combine(MachineInstr &I1, MachineInstr &I2,
++MI;
// Figure out whether I1 or I2 goes into the lowreg part.
- unsigned I1DestReg = I1.getOperand(0).getReg();
- unsigned I2DestReg = I2.getOperand(0).getReg();
+ Register I1DestReg = I1.getOperand(0).getReg();
+ Register I2DestReg = I2.getOperand(0).getReg();
bool IsI1Loreg = (I2DestReg - I1DestReg) == 1;
unsigned LoRegDef = IsI1Loreg ? I1DestReg : I2DestReg;
unsigned SubLo;
@@ -758,7 +758,7 @@ void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt,
unsigned DoubleDestReg,
MachineOperand &HiOperand,
MachineOperand &LoOperand) {
- unsigned LoReg = LoOperand.getReg();
+ Register LoReg = LoOperand.getReg();
unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
DebugLoc DL = InsertPt->getDebugLoc();
@@ -807,7 +807,7 @@ void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt,
MachineOperand &HiOperand,
MachineOperand &LoOperand) {
unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
- unsigned HiReg = HiOperand.getReg();
+ Register HiReg = HiOperand.getReg();
DebugLoc DL = InsertPt->getDebugLoc();
MachineBasicBlock *BB = InsertPt->getParent();
@@ -857,8 +857,8 @@ void HexagonCopyToCombine::emitCombineRR(MachineBasicBlock::iterator &InsertPt,
MachineOperand &LoOperand) {
unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
- unsigned LoReg = LoOperand.getReg();
- unsigned HiReg = HiOperand.getReg();
+ Register LoReg = LoOperand.getReg();
+ Register HiReg = HiOperand.getReg();
DebugLoc DL = InsertPt->getDebugLoc();
MachineBasicBlock *BB = InsertPt->getParent();
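
Tying together the checks above: findPairable and combine only accept two transfers whose destinations form a 64-bit pair, i.e. the two destination registers are adjacent and the lower one is even. A compact standalone restatement of that eligibility test, with the R0 base passed in as a parameter so the sketch stays self-contained:

    // Rn and Rn+1 form a valid double register D(n/2) only when n is even and
    // the two registers are adjacent; this mirrors isEvenReg() and the
    // (I2DestReg - I1DestReg) == 1 check in the code above.
    static bool formsDoubleRegPair(unsigned LoDest, unsigned HiDest,
                                   unsigned R0Base) {
      bool LowIsEven = (LoDest - R0Base) % 2 == 0;
      bool Adjacent = (HiDest - LoDest) == 1;
      return LowIsEven && Adjacent;
    }
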
diff --git a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
index 2ce1419e4790..e4a2ba0ec29c 100644
--- a/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
+++ b/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
@@ -37,12 +37,12 @@ def: Pat<(int_hexagon_F2_sfmax IntRegs:$src1, IntRegs:$src2),
(F2_sfmax IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vabswsat DoubleRegs:$src1),
(A2_vabswsat DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2),
- (S2_asr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
- (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred:$src2),
- (A4_combineri IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -75,8 +75,8 @@ def: Pat<(int_hexagon_A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_maxup DoubleRegs:$src1, DoubleRegs:$src2),
(A2_maxup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2),
- (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_interleave DoubleRegs:$src1),
(S2_interleave DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2),
@@ -89,10 +89,10 @@ def: Pat<(int_hexagon_C2_cmpgtu IntRegs:$src1, IntRegs:$src2),
(C2_cmpgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2),
(C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2),
- (A4_cmphgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2),
- (C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyi IntRegs:$src1, IntRegs:$src2),
(M2_mpyi IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_df2uw_chop DoubleRegs:$src1),
@@ -103,12 +103,12 @@ def: Pat<(int_hexagon_M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3),
- (S2_extractup DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
+ (S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S4_ntstbit_r IntRegs:$src1, IntRegs:$src2),
(S4_ntstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_w2sf IntRegs:$src1),
@@ -125,10 +125,10 @@ def: Pat<(int_hexagon_A4_cmpbgt IntRegs:$src1, IntRegs:$src2),
(A4_cmpbgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2),
- (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_orp DoubleRegs:$src1, DoubleRegs:$src2),
@@ -137,28 +137,28 @@ def: Pat<(int_hexagon_M2_mpyu_up IntRegs:$src1, IntRegs:$src2),
(M2_mpyu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2),
- (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2),
- (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_cmpbgtu IntRegs:$src1, IntRegs:$src2),
(A4_cmpbgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2),
(A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2),
- (A4_cmpbgti IntRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2),
+ (A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_addsp IntRegs:$src1, DoubleRegs:$src2),
(A2_addsp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2),
(S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2),
- (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2),
(S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_pmpyw IntRegs:$src1, IntRegs:$src2),
@@ -177,10 +177,10 @@ def: Pat<(int_hexagon_A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
- (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2),
- (S2_asl_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -199,10 +199,10 @@ def: Pat<(int_hexagon_M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
(M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_bitsset IntRegs:$src1, IntRegs:$src2),
(C2_bitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2),
- (M2_mpysip IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2),
- (M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysip IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (M2_mpysip IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysin IntRegs:$src1, u8_0ImmPred_timm:$src2),
+ (M2_mpysin IntRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_boundscheck IntRegs:$src1, DoubleRegs:$src2),
(A4_boundscheck IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2),
@@ -225,10 +225,10 @@ def: Pat<(int_hexagon_F2_conv_ud2df DoubleRegs:$src1),
(F2_conv_ud2df DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S4_subi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vzxthw IntRegs:$src1),
(S2_vzxthw IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2),
@@ -241,12 +241,12 @@ def: Pat<(int_hexagon_M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$sr
(M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2),
(M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4),
- (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3, u5_0ImmPred:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4),
+ (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_packhl IntRegs:$src1, IntRegs:$src2),
(S2_packhl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2),
- (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -259,8 +259,8 @@ def: Pat<(int_hexagon_M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_d2df DoubleRegs:$src1),
(F2_conv_d2df DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2),
- (C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vconj DoubleRegs:$src1),
(A2_vconj DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2),
@@ -279,8 +279,8 @@ def: Pat<(int_hexagon_C2_any8 PredRegs:$src1),
(C2_any8 PredRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_togglebit_r IntRegs:$src1, IntRegs:$src2),
(S2_togglebit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2),
- (S2_togglebit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_uw2sf IntRegs:$src1),
(F2_conv_uw2sf IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vsathb_nopack DoubleRegs:$src1),
@@ -303,10 +303,10 @@ def: Pat<(int_hexagon_C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)
(C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2),
- (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+ (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -323,34 +323,34 @@ def: Pat<(int_hexagon_M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleR
(M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2),
(M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred:$src1),
- (A2_tfrsi s32_0ImmPred:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred_timm:$src1),
+ (A2_tfrsi s32_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_svnavgh IntRegs:$src1, IntRegs:$src2),
(A2_svnavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2),
- (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2),
- (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+ (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_svavgh IntRegs:$src1, IntRegs:$src2),
(A2_svavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
- (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_combine_hl IntRegs:$src1, IntRegs:$src2),
(A2_combine_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_up IntRegs:$src1, IntRegs:$src2),
@@ -381,10 +381,10 @@ def: Pat<(int_hexagon_M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3
(M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_mpyrr_addi u32_0ImmPred:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
- (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2),
@@ -453,8 +453,8 @@ def: Pat<(int_hexagon_M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2),
- (C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_sat DoubleRegs:$src1),
@@ -469,8 +469,8 @@ def: Pat<(int_hexagon_A2_svavghs IntRegs:$src1, IntRegs:$src2),
(A2_svavghs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2),
- (C2_bitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
+ (C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
(A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
@@ -535,10 +535,10 @@ def: Pat<(int_hexagon_C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3
(C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_parityp DoubleRegs:$src1, DoubleRegs:$src2),
(S2_parityp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -557,30 +557,30 @@ def: Pat<(int_hexagon_M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src
(M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3),
- (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
+ (S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_tstbit_r IntRegs:$src1, IntRegs:$src2),
(S2_tstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3),
- (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3),
+ (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2),
- (S2_tstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2),
(S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2),
- (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
@@ -605,14 +605,14 @@ def: Pat<(int_hexagon_F2_conv_w2df IntRegs:$src1),
(F2_conv_w2df IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
(A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2),
- (C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vcnegh DoubleRegs:$src1, IntRegs:$src2),
(S2_vcnegh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2),
- (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2),
@@ -633,8 +633,8 @@ def: Pat<(int_hexagon_S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3
(S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_cl0p DoubleRegs:$src1),
(S2_cl0p DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3),
- (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
+ (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sffixupd IntRegs:$src1, IntRegs:$src2),
(F2_sffixupd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
@@ -653,8 +653,8 @@ def: Pat<(int_hexagon_M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs
(M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2),
- (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sffixupr IntRegs:$src1),
(F2_sffixupr IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
@@ -669,32 +669,32 @@ def: Pat<(int_hexagon_C2_andn PredRegs:$src1, PredRegs:$src2),
(C2_andn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2),
(M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
- (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2),
- (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2),
- (A4_round_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_max IntRegs:$src1, IntRegs:$src2),
(A2_max IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_round_rr IntRegs:$src1, IntRegs:$src2),
(A4_round_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2),
- (A4_combineii s8_0ImmPred:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineir s32_0ImmPred:$src1, IntRegs:$src2),
- (A4_combineir s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineii s8_0ImmPred_timm:$src1, u32_0ImmPred_timm:$src2),
+ (A4_combineii s8_0ImmPred_timm:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2),
+ (A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
(C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -703,8 +703,8 @@ def: Pat<(int_hexagon_A4_rcmpeq IntRegs:$src1, IntRegs:$src2),
(A4_rcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2),
(M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vzxtbh IntRegs:$src1),
(S2_vzxtbh IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -721,8 +721,8 @@ def: Pat<(int_hexagon_M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2),
(M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S4_ori_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C4_nbitsset IntRegs:$src1, IntRegs:$src2),
(C4_nbitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -745,10 +745,10 @@ def: Pat<(int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs
(M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred:$src1),
- (F2_sfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred:$src1),
- (F2_sfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred_timm:$src1),
+ (F2_sfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred_timm:$src1),
+ (F2_sfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2),
(M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
@@ -759,14 +759,14 @@ def: Pat<(int_hexagon_F2_conv_d2sf DoubleRegs:$src1),
(F2_conv_d2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2),
- (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2),
+ (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sfcmpuo IntRegs:$src1, IntRegs:$src2),
(F2_sfcmpuo IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vsatwh_nopack DoubleRegs:$src1),
(S2_vsatwh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2),
@@ -783,8 +783,8 @@ def: Pat<(int_hexagon_M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_minp DoubleRegs:$src1, DoubleRegs:$src2),
(A2_minp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
- (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
@@ -817,16 +817,16 @@ def: Pat<(int_hexagon_S4_extract_rp IntRegs:$src1, DoubleRegs:$src2),
(S4_extract_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2),
- (C4_cmplteui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S4_addi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_tfrcpp CtrRegs64:$src1),
(A4_tfrcpp CtrRegs64:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2),
- (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2),
- (A4_cmphgti IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
@@ -837,8 +837,8 @@ def: Pat<(int_hexagon_S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRe
(S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S4_subi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2),
(S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2),
@@ -851,14 +851,14 @@ def: Pat<(int_hexagon_S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs
(S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_satb IntRegs:$src1),
(A2_satb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4),
- (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3, u6_0ImmPred:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4),
+ (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2),
(S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2),
@@ -925,8 +925,8 @@ def: Pat<(int_hexagon_M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2),
(M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2),
(M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3),
- (C2_muxri PredRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
+ (C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -937,8 +937,8 @@ def: Pat<(int_hexagon_M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -947,8 +947,8 @@ def: Pat<(int_hexagon_S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs
(S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -957,16 +957,16 @@ def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs
(M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2),
(C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3),
- (M4_mpyri_addi u32_0ImmPred:$src1, IntRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S4_andi_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3),
- (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3),
+ (M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+ (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_tfrcrr CtrRegs:$src1),
(A2_tfrcrr CtrRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3),
- (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+ (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_orn PredRegs:$src1, PredRegs:$src2),
(C2_orn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1005,8 +1005,8 @@ def: Pat<(int_hexagon_M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntR
(M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2),
(F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
- (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2),
(A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2),
@@ -1017,10 +1017,10 @@ def: Pat<(int_hexagon_A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vsxthw IntRegs:$src1),
(S2_vsxthw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S4_andi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_cmpgt IntRegs:$src1, IntRegs:$src2),
@@ -1035,22 +1035,22 @@ def: Pat<(int_hexagon_F2_conv_sf2w IntRegs:$src1),
(F2_conv_sf2w IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2),
- (F2_sfclass IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3),
- (S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3),
+ (S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
(M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred:$src2),
- (A2_addi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_addp DoubleRegs:$src1, DoubleRegs:$src2),
(A2_addp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2),
@@ -1063,8 +1063,8 @@ def: Pat<(int_hexagon_M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2),
(S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
@@ -1131,12 +1131,12 @@ def: Pat<(int_hexagon_C2_and PredRegs:$src1, PredRegs:$src2),
(C2_and PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S5_popcountp DoubleRegs:$src1),
(S5_popcountp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3),
- (S4_extractp DoubleRegs:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
+ (S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_cl0 IntRegs:$src1),
(S2_cl0 IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2),
- (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -1167,8 +1167,8 @@ def: Pat<(int_hexagon_M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2),
- (A4_bitspliti IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2),
@@ -1185,26 +1185,26 @@ def: Pat<(int_hexagon_F2_conv_sf2d IntRegs:$src1),
(F2_conv_sf2d IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred:$src1),
- (F2_dfimm_n u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred_timm:$src1),
+ (F2_dfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_cmphgt IntRegs:$src1, IntRegs:$src2),
(A4_cmphgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred:$src1),
- (F2_dfimm_p u10_0ImmPred:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred_timm:$src1),
+ (F2_dfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3),
- (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3),
+ (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
(M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3),
- (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
+ (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1215,20 +1215,20 @@ def: Pat<(int_hexagon_A2_maxu IntRegs:$src1, IntRegs:$src2),
(A2_maxu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_maxp DoubleRegs:$src1, DoubleRegs:$src2),
(A2_maxp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred:$src2),
- (A2_andir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sfrecipa IntRegs:$src1, IntRegs:$src2),
(F2_sfrecipa IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2),
- (A2_combineii s32_0ImmPred:$src1, s8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2),
+ (A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_orn IntRegs:$src1, IntRegs:$src2),
(A4_orn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2),
- (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2),
- (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2),
+ (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_r IntRegs:$src1, IntRegs:$src2),
(S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2),
@@ -1251,16 +1251,16 @@ def: Pat<(int_hexagon_A2_satub IntRegs:$src1),
(A2_satub IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2),
(M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
- (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2),
(C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2),
- (A2_tfrih IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred:$src2),
- (A2_tfril IntRegs:$src1, u16_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3),
- (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2),
+ (A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2),
+ (A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+ (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vtrunehb DoubleRegs:$src1),
(S2_vtrunehb DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vabsw DoubleRegs:$src1),
@@ -1269,14 +1269,14 @@ def: Pat<(int_hexagon_A2_vabsh DoubleRegs:$src1),
(A2_vabsh DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sfsub IntRegs:$src1, IntRegs:$src2),
(F2_sfsub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3),
- (C2_muxii PredRegs:$src1, s32_0ImmPred:$src2, s8_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
- (C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3),
+ (C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_swiz IntRegs:$src1),
(A2_swiz IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2),
(M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2),
@@ -1295,44 +1295,44 @@ def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs
(M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3),
- (S4_extract IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
+ (S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_ud2sf DoubleRegs:$src1),
(F2_conv_ud2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_tfr IntRegs:$src1),
(A2_tfr IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subri s32_0ImmPred:$src1, IntRegs:$src2),
- (A2_subri s32_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2),
+ (A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M5_vmpybuu IntRegs:$src1, IntRegs:$src2),
(M5_vmpybuu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2),
- (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_brev IntRegs:$src1),
(S2_brev IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2),
- (S2_clrbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2),
- (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
@@ -1343,8 +1343,8 @@ def: Pat<(int_hexagon_M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3),
- (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vrndpackwhs DoubleRegs:$src1),
(S2_vrndpackwhs DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2),
@@ -1357,24 +1357,24 @@ def: Pat<(int_hexagon_M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4),
- (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4),
+ (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_uw2df IntRegs:$src1),
(F2_conv_uw2df IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred:$src2),
- (A2_orir IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_andp DoubleRegs:$src1, DoubleRegs:$src2),
(A2_andp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2),
(S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_min IntRegs:$src1, IntRegs:$src2),
(A2_min IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2),
- (M2_mpysmi IntRegs:$src1, m32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2),
+ (M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1397,10 +1397,10 @@ def: Pat<(int_hexagon_M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_df2w DoubleRegs:$src1),
(F2_conv_df2w DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2),
- (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_df2d DoubleRegs:$src1),
(F2_conv_df2d DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
@@ -1423,8 +1423,8 @@ def: Pat<(int_hexagon_A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4),
(F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2),
- (F2_dfclass DoubleRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_df2ud DoubleRegs:$src1),
(F2_conv_df2ud DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_df2uw DoubleRegs:$src1),
@@ -1433,16 +1433,16 @@ def: Pat<(int_hexagon_M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2),
(M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2),
(M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2),
- (C4_cmpltei IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C4_cmplteu IntRegs:$src1, IntRegs:$src2),
(C4_cmplteu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vsubb_map DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2),
(A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2),
- (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
@@ -1471,14 +1471,14 @@ def: Pat<(int_hexagon_M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2),
- (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2),
(A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_svaddhs IntRegs:$src1, IntRegs:$src2),
(A2_svaddhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S4_ori_lsr_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
@@ -1499,8 +1499,8 @@ def: Pat<(int_hexagon_M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_lsli s6_0ImmPred:$src1, IntRegs:$src2),
- (S4_lsli s6_0ImmPred:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2),
+ (S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2),
(S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2),
@@ -1529,8 +1529,8 @@ def: Pat<(int_hexagon_A4_cmpbeq IntRegs:$src1, IntRegs:$src2),
(A4_cmpbeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_negp DoubleRegs:$src1),
(A2_negp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2),
- (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
(A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vsatwuh DoubleRegs:$src1),
@@ -1541,10 +1541,10 @@ def: Pat<(int_hexagon_S2_svsathb IntRegs:$src1),
(S2_svsathb IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2),
(C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2),
- (A4_cround_ri IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2),
- (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2),
+ (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_cround_rr IntRegs:$src1, IntRegs:$src2),
(A4_cround_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1563,12 +1563,12 @@ def: Pat<(int_hexagon_A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vminub DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vminub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3),
- (S2_extractu IntRegs:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
+ (S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_svsubh IntRegs:$src1, IntRegs:$src2),
(A2_svsubh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2),
- (S4_clbaddi IntRegs:$src1, s6_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2),
+ (S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vsxtbh IntRegs:$src1),
@@ -1589,16 +1589,16 @@ def: Pat<(int_hexagon_M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$sr
(M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S4_addi_asl_ri u32_0ImmPred:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2),
- (A4_cmpheqi IntRegs:$src1, s32_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1623,8 +1623,8 @@ def: Pat<(int_hexagon_M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntReg
(M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2),
- (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
@@ -1637,10 +1637,10 @@ def: Pat<(int_hexagon_M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRe
(M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2),
- (S2_setbit_i IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_andn IntRegs:$src1, IntRegs:$src2),
(A4_andn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
@@ -1655,8 +1655,8 @@ def: Pat<(int_hexagon_C2_bitsclr IntRegs:$src1, IntRegs:$src2),
(C2_bitsclr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2),
- (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+ (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_ornp DoubleRegs:$src1, DoubleRegs:$src2),
(A4_ornp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
@@ -1673,14 +1673,14 @@ def: Pat<(int_hexagon_M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2),
(M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2),
(M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2),
- (C4_nbitsclri IntRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2),
- (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
+ (C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
// V55 Scalar Instructions.
@@ -1689,30 +1689,30 @@ def: Pat<(int_hexagon_A5_ACS DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src
// V60 Scalar Instructions.
-def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2),
- (S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred:$src2)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2),
- (S6_rol_i_r IntRegs:$src1, u5_0ImmPred:$src2)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3),
- (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3),
- (S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
// V62 Scalar Instructions.
@@ -1744,8 +1744,8 @@ def: Pat<(int_hexagon_F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2),
(F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>;
def: Pat<(int_hexagon_M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV66]>;
-def: Pat<(int_hexagon_S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2),
- (S2_mask u5_0ImmPred:$src1, u5_0ImmPred:$src2)>, Requires<[HasV66]>;
+def: Pat<(int_hexagon_S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2),
+ (S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV66]>;
// V60 HVX Instructions.
@@ -1773,10 +1773,10 @@ def: Pat<(int_hexagon_V6_vaddh_dv HvxWR:$src1, HvxWR:$src2),
(V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddh_dv_128B HvxWR:$src1, HvxWR:$src2),
(V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
- (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
- (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vshufoh HvxVR:$src1, HvxVR:$src2),
(V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vshufoh_128B HvxVR:$src1, HvxVR:$src2),
@@ -1789,10 +1789,10 @@ def: Pat<(int_hexagon_V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2),
(V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vdmpyhsuisat_128B HvxWR:$src1, IntRegs:$src2),
(V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
- (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
- (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vnavgw HvxVR:$src1, HvxVR:$src2),
(V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vnavgw_128B HvxVR:$src1, HvxVR:$src2),
@@ -2369,10 +2369,10 @@ def: Pat<(int_hexagon_V6_vsubhsat HvxVR:$src1, HvxVR:$src2),
(V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubhsat_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
- (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
- (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vabsw HvxVR:$src1),
(V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vabsw_128B HvxVR:$src1),
@@ -2489,10 +2489,10 @@ def: Pat<(int_hexagon_V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpybv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
- (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
- (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
(V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
@@ -2677,10 +2677,10 @@ def: Pat<(int_hexagon_V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
- (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
- (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsatwh HvxVR:$src1, HvxVR:$src2),
(V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsatwh_128B HvxVR:$src1, HvxVR:$src2),
@@ -2721,10 +2721,10 @@ def: Pat<(int_hexagon_V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_veqh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
- (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
- (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddwsat HvxVR:$src1, HvxVR:$src2),
(V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddwsat_128B HvxVR:$src1, HvxVR:$src2),
@@ -2885,10 +2885,10 @@ def: Pat<(int_hexagon_V6_vsubh HvxVR:$src1, HvxVR:$src2),
(V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubh_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
- (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3),
- (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vminw HvxVR:$src1, HvxVR:$src2),
(V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vminw_128B HvxVR:$src1, HvxVR:$src2),
@@ -2929,10 +2929,10 @@ def: Pat<(int_hexagon_V6_vsubuhw HvxVR:$src1, HvxVR:$src2),
(V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubuhw_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
- (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4),
- (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vasrw HvxVR:$src1, IntRegs:$src2),
(V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vasrw_128B HvxVR:$src1, IntRegs:$src2),
@@ -3016,10 +3016,10 @@ def: Pat<(int_hexagon_V6_vlsrb HvxVR:$src1, IntRegs:$src2),
(V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vlsrb_128B HvxVR:$src1, IntRegs:$src2),
(V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
- (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
- (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2),
(V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddububb_sat_128B HvxVR:$src1, HvxVR:$src2),
@@ -3032,10 +3032,10 @@ def: Pat<(int_hexagon_V6_ldtp0 PredRegs:$src1, IntRegs:$src2),
(V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_ldtp0_128B PredRegs:$src1, IntRegs:$src2),
(V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4),
- (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4),
- (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2),
(V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubuwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
@@ -3124,10 +3124,10 @@ def: Pat<(int_hexagon_V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$sr
(V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vasrwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
(V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
- (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3),
- (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsubuwsat HvxVR:$src1, HvxVR:$src2),
(V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubuwsat_128B HvxVR:$src1, HvxVR:$src2),
@@ -3188,10 +3188,10 @@ def: Pat<(int_hexagon_V6_ldcnp0 PredRegs:$src1, IntRegs:$src2),
(V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_ldcnp0_128B PredRegs:$src1, IntRegs:$src2),
(V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4),
- (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4),
- (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred:$src4)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsubbsat HvxVR:$src1, HvxVR:$src2),
(V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubbsat_128B HvxVR:$src1, HvxVR:$src2),
diff --git a/lib/Target/Hexagon/HexagonDepOperands.td b/lib/Target/Hexagon/HexagonDepOperands.td
index fdba7b971258..8a94d96522cc 100644
--- a/lib/Target/Hexagon/HexagonDepOperands.td
+++ b/lib/Target/Hexagon/HexagonDepOperands.td
@@ -8,120 +8,125 @@
// Automatically generated file, please consult code owner before editing.
//===----------------------------------------------------------------------===//
+multiclass ImmOpPred<code pred, ValueType vt = i32> {
+ def "" : PatLeaf<(vt imm), pred>;
+ def _timm : PatLeaf<(vt timm), pred>;
+}
+
def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
-def s4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
+defm s4_0ImmPred : ImmOpPred<[{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; }
def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; }
-def s29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
+defm s29_3ImmPred : ImmOpPred<[{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
+defm u6_0ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
def a30_2ImmOperand : AsmOperandClass { let Name = "a30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def a30_2Imm : Operand<i32> { let ParserMatchClass = a30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-def a30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+defm a30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; }
def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
+defm u29_3ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; let RenderMethod = "addSignedImmOperands"; }
def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand; let DecoderMethod = "s8_0ImmDecoder"; }
-def s8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
+defm s8_0ImmPred : ImmOpPred<[{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; let RenderMethod = "addImmOperands"; }
def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
+defm u32_0ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; }
def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
+defm u4_2ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; }
def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>;
+defm u3_0ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>;
def b15_2ImmOperand : AsmOperandClass { let Name = "b15_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def b15_2Imm : Operand<OtherVT> { let ParserMatchClass = b15_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-def b15_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<15, 2>(N->getSExtValue());}]>;
+defm b15_2ImmPred : ImmOpPred<[{ return isShiftedInt<15, 2>(N->getSExtValue());}]>;
def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; }
def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u11_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>;
+defm u11_3ImmPred : ImmOpPred<[{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>;
def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; }
def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; }
-def s4_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 3>(N->getSExtValue());}]>;
+defm s4_3ImmPred : ImmOpPred<[{ return isShiftedInt<4, 3>(N->getSExtValue());}]>;
def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; }
def m32_0Imm : Operand<i32> { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def m32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+defm m32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; }
def u3_1Imm : Operand<i32> { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u3_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>;
+defm u3_1ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>;
def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; let RenderMethod = "addImmOperands"; }
def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u1_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>;
+defm u1_0ImmPred : ImmOpPred<[{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>;
def s31_1ImmOperand : AsmOperandClass { let Name = "s31_1Imm"; let RenderMethod = "addSignedImmOperands"; }
def s31_1Imm : Operand<i32> { let ParserMatchClass = s31_1ImmOperand; let DecoderMethod = "s31_1ImmDecoder"; }
-def s31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 1>(N->getSExtValue());}]>;
+defm s31_1ImmPred : ImmOpPred<[{ return isShiftedInt<32, 1>(N->getSExtValue());}]>;
def s3_0ImmOperand : AsmOperandClass { let Name = "s3_0Imm"; let RenderMethod = "addSignedImmOperands"; }
def s3_0Imm : Operand<i32> { let ParserMatchClass = s3_0ImmOperand; let DecoderMethod = "s3_0ImmDecoder"; }
-def s3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<3, 0>(N->getSExtValue());}]>;
+defm s3_0ImmPred : ImmOpPred<[{ return isShiftedInt<3, 0>(N->getSExtValue());}]>;
def s30_2ImmOperand : AsmOperandClass { let Name = "s30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def s30_2Imm : Operand<i32> { let ParserMatchClass = s30_2ImmOperand; let DecoderMethod = "s30_2ImmDecoder"; }
-def s30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+defm s30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; let RenderMethod = "addImmOperands"; }
def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>;
+defm u4_0ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>;
def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; let RenderMethod = "addSignedImmOperands"; }
def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand; let DecoderMethod = "s6_0ImmDecoder"; }
-def s6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 0>(N->getSExtValue());}]>;
+defm s6_0ImmPred : ImmOpPred<[{ return isShiftedInt<6, 0>(N->getSExtValue());}]>;
def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; }
def u5_3Imm : Operand<i32> { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u5_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>;
+defm u5_3ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>;
def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; let RenderMethod = "addSignedImmOperands"; }
def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand; let DecoderMethod = "s32_0ImmDecoder"; }
-def s32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+defm s32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; }
def s6_3Imm : Operand<i32> { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; }
-def s6_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 3>(N->getSExtValue());}]>;
+defm s6_3ImmPred : ImmOpPred<[{ return isShiftedInt<6, 3>(N->getSExtValue());}]>;
def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; let RenderMethod = "addImmOperands"; }
def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>;
+defm u10_0ImmPred : ImmOpPred<[{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>;
def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; }
def u31_1Imm : Operand<i32> { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>;
+defm u31_1ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>;
def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; }
def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; }
-def s4_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 1>(N->getSExtValue());}]>;
+defm s4_1ImmPred : ImmOpPred<[{ return isShiftedInt<4, 1>(N->getSExtValue());}]>;
def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; }
def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u16_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>;
+defm u16_0ImmPred : ImmOpPred<[{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>;
def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; let RenderMethod = "addImmOperands"; }
def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u6_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>;
+defm u6_1ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>;
def u5_2ImmOperand : AsmOperandClass { let Name = "u5_2Imm"; let RenderMethod = "addImmOperands"; }
def u5_2Imm : Operand<i32> { let ParserMatchClass = u5_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u5_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>;
+defm u5_2ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>;
def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; }
def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u26_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>;
+defm u26_6ImmPred : ImmOpPred<[{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>;
def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; }
def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u6_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>;
+defm u6_2ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>;
def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; }
def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u7_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>;
+defm u7_0ImmPred : ImmOpPred<[{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>;
def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def b13_2Imm : Operand<OtherVT> { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-def b13_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<13, 2>(N->getSExtValue());}]>;
+defm b13_2ImmPred : ImmOpPred<[{ return isShiftedInt<13, 2>(N->getSExtValue());}]>;
def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; }
def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u5_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>;
+defm u5_0ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>;
def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; }
def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u2_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>;
+defm u2_0ImmPred : ImmOpPred<[{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>;
def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; }
-def s4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 2>(N->getSExtValue());}]>;
+defm s4_2ImmPred : ImmOpPred<[{ return isShiftedInt<4, 2>(N->getSExtValue());}]>;
def b30_2ImmOperand : AsmOperandClass { let Name = "b30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def b30_2Imm : Operand<OtherVT> { let ParserMatchClass = b30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-def b30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+defm b30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; }
def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>;
+defm u8_0ImmPred : ImmOpPred<[{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>;
def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; }
def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-def u30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
+defm u30_2ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
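
The ImmOpPred multiclass introduced above emits two PatLeaf records per operand predicate: the original name, matching a plain immediate, and a "_timm" twin matching a target-constant node, which is what the intrinsic patterns earlier in this patch now reference. Both variants share the same C++ predicate body. As a minimal, illustration-only sketch (not LLVM code) of what a check such as isShiftedUInt<3, 0> or isShiftedUInt<4, 2> verifies:

    #include <cassert>
    #include <cstdint>

    // Illustration of the predicate bodies above: true when X is an unsigned
    // N-bit value scaled by 2^S, i.e. it fits in N+S bits and its low S bits
    // are clear. The generated u3_0ImmPred and u3_0ImmPred_timm leaves differ
    // only in the kind of node they match, not in this check.
    template <unsigned N, unsigned S>
    bool isShiftedUIntSketch(uint64_t X) {
      static_assert(N > 0 && N + S < 64, "sketch limited to widths below 64");
      bool Fits = X < (UINT64_C(1) << (N + S));
      bool LowBitsClear = (X & ((UINT64_C(1) << S) - 1)) == 0;
      return Fits && LowBitsClear;
    }

    int main() {
      assert(isShiftedUIntSketch<3, 0>(7));   // u3_0Imm: 0..7 accepted
      assert(!isShiftedUIntSketch<3, 0>(8));  // needs four bits
      assert(isShiftedUIntSketch<4, 2>(60));  // u4_2Imm: multiple of 4, at most 60
      assert(!isShiftedUIntSketch<4, 2>(61)); // low bits not clear
      return 0;
    }
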
diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index c1f32e54e98d..0844fb8a8629 100644
--- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -250,7 +250,7 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
unsigned Opc = T1I->getOpcode();
if (Opc != Hexagon::J2_jumpt && Opc != Hexagon::J2_jumpf)
return false;
- unsigned PredR = T1I->getOperand(0).getReg();
+ Register PredR = T1I->getOperand(0).getReg();
// Get the layout successor, or 0 if B does not have one.
MachineFunction::iterator NextBI = std::next(MachineFunction::iterator(B));
@@ -384,8 +384,8 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned R = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = MO.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
if (!isPredicate(R))
continue;
@@ -401,8 +401,8 @@ bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const {
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned R = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = MO.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
const MachineInstr *DefI = MRI->getVRegDef(R);
// "Undefined" virtual registers are actually defined via IMPLICIT_DEF.
@@ -437,7 +437,7 @@ bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const {
break;
if (usesUndefVReg(&MI))
return false;
- unsigned DefR = MI.getOperand(0).getReg();
+ Register DefR = MI.getOperand(0).getReg();
if (isPredicate(DefR))
return false;
}
@@ -491,8 +491,8 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned R = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = MO.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
if (isPredicate(R))
PredDefs++;
@@ -798,7 +798,7 @@ unsigned HexagonEarlyIfConversion::buildMux(MachineBasicBlock *B,
const MCInstrDesc &D = HII->get(Opc);
DebugLoc DL = B->findBranchDebugLoc();
- unsigned MuxR = MRI->createVirtualRegister(DRC);
+ Register MuxR = MRI->createVirtualRegister(DRC);
BuildMI(*B, At, DL, D, MuxR)
.addReg(PredR)
.addReg(TR, 0, TSR)
@@ -837,7 +837,7 @@ void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
unsigned MuxR = 0, MuxSR = 0;
if (TR && FR) {
- unsigned DR = PN->getOperand(0).getReg();
+ Register DR = PN->getOperand(0).getReg();
const TargetRegisterClass *RC = MRI->getRegClass(DR);
MuxR = buildMux(FP.SplitB, FP.SplitB->getFirstTerminator(), RC,
FP.PredR, TR, TSR, FR, FSR);
@@ -988,8 +988,8 @@ void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
MachineInstr *PN = &*I;
assert(PN->getNumOperands() == 3 && "Invalid phi node");
MachineOperand &UO = PN->getOperand(1);
- unsigned UseR = UO.getReg(), UseSR = UO.getSubReg();
- unsigned DefR = PN->getOperand(0).getReg();
+ Register UseR = UO.getReg(), UseSR = UO.getSubReg();
+ Register DefR = PN->getOperand(0).getReg();
unsigned NewR = UseR;
if (UseSR) {
// MRI.replaceVregUsesWith does not allow to update the subregister,
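
The remaining C++ hunks in this patch make the same mechanical change: values obtained from MachineOperand::getReg() or MachineRegisterInfo::createVirtualRegister() are held in the llvm::Register wrapper rather than a raw unsigned, and the virtual/physical queries move from the old TargetRegisterInfo statics to Register::isVirtualRegister() and Register::isPhysicalRegister(). A minimal sketch of the idiom, using only calls that appear in the hunks above (countVirtualDefs is a hypothetical helper written for illustration, not part of the patch):

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/Register.h"

    using namespace llvm;

    // Count virtual-register definitions in one instruction, in the style of
    // HexagonEarlyIfConversion::countPredicateDefs after this change.
    static unsigned countVirtualDefs(const MachineInstr &MI) {
      unsigned Count = 0;
      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isReg() || !MO.isDef())
          continue;
        Register R = MO.getReg();            // Register, not unsigned
        if (Register::isVirtualRegister(R))  // was TargetRegisterInfo::isVirtualRegister
          ++Count;
      }
      return Count;
    }

Register converts implicitly to and from unsigned, which is why the replacement can be made line by line without touching the surrounding code.
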
diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index c343e426ac7d..8984ee82960d 100644
--- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -285,7 +285,7 @@ bool HexagonExpandCondsets::isCondset(const MachineInstr &MI) {
}
LaneBitmask HexagonExpandCondsets::getLaneMask(unsigned Reg, unsigned Sub) {
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Register::isVirtualRegister(Reg));
return Sub != 0 ? TRI->getSubRegIndexLaneMask(Sub)
: MRI->getMaxLaneMaskForVReg(Reg);
}
@@ -364,7 +364,7 @@ void HexagonExpandCondsets::updateKillFlags(unsigned Reg) {
void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
LiveRange &Range) {
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Register::isVirtualRegister(Reg));
if (Range.empty())
return;
@@ -372,8 +372,8 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> std::pair<bool,bool> {
if (!Op.isReg() || !Op.isDef())
return { false, false };
- unsigned DR = Op.getReg(), DSR = Op.getSubReg();
- if (!TargetRegisterInfo::isVirtualRegister(DR) || DR != Reg)
+ Register DR = Op.getReg(), DSR = Op.getSubReg();
+ if (!Register::isVirtualRegister(DR) || DR != Reg)
return { false, false };
LaneBitmask SLM = getLaneMask(DR, DSR);
LaneBitmask A = SLM & LM;
@@ -551,8 +551,8 @@ void HexagonExpandCondsets::updateLiveness(std::set<unsigned> &RegSet,
bool Recalc, bool UpdateKills, bool UpdateDeads) {
UpdateKills |= UpdateDeads;
for (unsigned R : RegSet) {
- if (!TargetRegisterInfo::isVirtualRegister(R)) {
- assert(TargetRegisterInfo::isPhysicalRegister(R));
+ if (!Register::isVirtualRegister(R)) {
+ assert(Register::isPhysicalRegister(R));
// There shouldn't be any physical registers as operands, except
// possibly reserved registers.
assert(MRI->isReserved(R));
@@ -579,17 +579,17 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO,
using namespace Hexagon;
if (SO.isReg()) {
- unsigned PhysR;
+ Register PhysR;
RegisterRef RS = SO;
- if (TargetRegisterInfo::isVirtualRegister(RS.Reg)) {
+ if (Register::isVirtualRegister(RS.Reg)) {
const TargetRegisterClass *VC = MRI->getRegClass(RS.Reg);
assert(VC->begin() != VC->end() && "Empty register class");
PhysR = *VC->begin();
} else {
- assert(TargetRegisterInfo::isPhysicalRegister(RS.Reg));
+ assert(Register::isPhysicalRegister(RS.Reg));
PhysR = RS.Reg;
}
- unsigned PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub);
+ Register PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub);
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS);
switch (TRI->getRegSizeInBits(*RC)) {
case 32:
@@ -671,7 +671,7 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
MachineOperand &MD = MI.getOperand(0); // Definition
MachineOperand &MP = MI.getOperand(1); // Predicate register
assert(MD.isDef());
- unsigned DR = MD.getReg(), DSR = MD.getSubReg();
+ Register DR = MD.getReg(), DSR = MD.getSubReg();
bool ReadUndef = MD.isUndef();
MachineBasicBlock::iterator At = MI;
@@ -802,7 +802,7 @@ bool HexagonExpandCondsets::canMoveOver(MachineInstr &MI, ReferenceMap &Defs,
// For physical register we would need to check register aliases, etc.
// and we don't want to bother with that. It would be of little value
// before the actual register rewriting (from virtual to physical).
- if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+ if (!Register::isVirtualRegister(RR.Reg))
return false;
// No redefs for any operand.
if (isRefInMap(RR, Defs, Exec_Then))
@@ -954,7 +954,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
return false;
RegisterRef RT(MS);
- unsigned PredR = MP.getReg();
+ Register PredR = MP.getReg();
MachineInstr *DefI = getReachingDefForPred(RT, TfrI, PredR, Cond);
if (!DefI || !isPredicable(DefI))
return false;
@@ -999,7 +999,7 @@ bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
// subregisters are other physical registers, and we are not checking
// that.
RegisterRef RR = Op;
- if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+ if (!Register::isVirtualRegister(RR.Reg))
return false;
ReferenceMap &Map = Op.isDef() ? Defs : Uses;
@@ -1091,7 +1091,7 @@ bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B,
}
bool HexagonExpandCondsets::isIntReg(RegisterRef RR, unsigned &BW) {
- if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+ if (!Register::isVirtualRegister(RR.Reg))
return false;
const TargetRegisterClass *RC = MRI->getRegClass(RR.Reg);
if (RC == &Hexagon::IntRegsRegClass) {
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index f7edc168de4a..d21de8ccb5ab 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -114,12 +114,11 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
// First pass - compute the offset of each basic block.
for (const MachineBasicBlock &MBB : MF) {
- if (MBB.getAlignment()) {
+ if (MBB.getAlignment() != Align::None()) {
// Although we don't know the exact layout of the final code, we need
// to account for alignment padding somehow. This heuristic pads each
// aligned basic block according to the alignment value.
- int ByteAlign = (1u << MBB.getAlignment()) - 1;
- InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign);
+ InstOffset = alignTo(InstOffset, MBB.getAlignment());
}
BlockToInstOffset[&MBB] = InstOffset;
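
The hunk above swaps the hand-rolled mask arithmetic for the Align type: MBB.getAlignment() now returns an llvm::Align rather than a log2 shift amount, and alignTo() rounds the running offset up to that boundary. A standalone sketch of the rounding that both the old and the new code perform (plain C++, no LLVM dependency; alignToSketch is illustration only):

    #include <cassert>
    #include <cstdint>

    // Round Offset up to a power-of-two Alignment: the same result the
    // removed "(InstOffset + ByteAlign) & ~ByteAlign" expression produced,
    // and what alignTo(InstOffset, MBB.getAlignment()) computes now.
    static uint64_t alignToSketch(uint64_t Offset, uint64_t Alignment) {
      assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0 &&
             "alignment must be a power of two");
      return (Offset + Alignment - 1) & ~(Alignment - 1);
    }

    int main() {
      assert(alignToSketch(13, 4) == 16);
      assert(alignToSketch(16, 4) == 16);
      assert(alignToSketch(0, 8) == 0);
      return 0;
    }
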
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 3368ee4fb3b9..bfa3372d7faf 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -303,10 +303,10 @@ static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
if (MO.isFI())
return true;
if (MO.isReg()) {
- unsigned R = MO.getReg();
+ Register R = MO.getReg();
// Virtual registers will need scavenging, which then may require
// a stack slot.
- if (TargetRegisterInfo::isVirtualRegister(R))
+ if (Register::isVirtualRegister(R))
return true;
for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
if (CSR[*S])
@@ -973,8 +973,8 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
// understand paired registers for cfi_offset.
// Eg .cfi_offset r1:0, -64
- unsigned HiReg = HRI.getSubReg(Reg, Hexagon::isub_hi);
- unsigned LoReg = HRI.getSubReg(Reg, Hexagon::isub_lo);
+ Register HiReg = HRI.getSubReg(Reg, Hexagon::isub_hi);
+ Register LoReg = HRI.getSubReg(Reg, Hexagon::isub_lo);
unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true);
unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true);
auto OffHi = MCCFIInstruction::createOffset(FrameLabel, HiDwarfReg,
@@ -1377,10 +1377,10 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized(
}
MFI.setLocalFrameSize(LFS);
- unsigned A = MFI.getLocalFrameMaxAlign();
+ Align A = MFI.getLocalFrameMaxAlign();
assert(A <= 8 && "Unexpected local frame alignment");
- if (A == 0)
- MFI.setLocalFrameMaxAlign(8);
+ if (A == 1)
+ MFI.setLocalFrameMaxAlign(Align(8));
MFI.setUseLocalStackAllocationBlock(true);
// Set the physical aligned-stack base address register.
@@ -1570,13 +1570,13 @@ bool HexagonFrameLowering::expandCopy(MachineBasicBlock &B,
const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
MachineInstr *MI = &*It;
DebugLoc DL = MI->getDebugLoc();
- unsigned DstR = MI->getOperand(0).getReg();
- unsigned SrcR = MI->getOperand(1).getReg();
+ Register DstR = MI->getOperand(0).getReg();
+ Register SrcR = MI->getOperand(1).getReg();
if (!Hexagon::ModRegsRegClass.contains(DstR) ||
!Hexagon::ModRegsRegClass.contains(SrcR))
return false;
- unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR).add(MI->getOperand(1));
BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), DstR)
.addReg(TmpR, RegState::Kill);
@@ -1595,13 +1595,13 @@ bool HexagonFrameLowering::expandStoreInt(MachineBasicBlock &B,
DebugLoc DL = MI->getDebugLoc();
unsigned Opc = MI->getOpcode();
- unsigned SrcR = MI->getOperand(2).getReg();
+ Register SrcR = MI->getOperand(2).getReg();
bool IsKill = MI->getOperand(2).isKill();
int FI = MI->getOperand(0).getIndex();
// TmpR = C2_tfrpr SrcR if SrcR is a predicate register
// TmpR = A2_tfrcrr SrcR if SrcR is a modifier register
- unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
unsigned TfrOpc = (Opc == Hexagon::STriw_pred) ? Hexagon::C2_tfrpr
: Hexagon::A2_tfrcrr;
BuildMI(B, It, DL, HII.get(TfrOpc), TmpR)
@@ -1628,11 +1628,11 @@ bool HexagonFrameLowering::expandLoadInt(MachineBasicBlock &B,
DebugLoc DL = MI->getDebugLoc();
unsigned Opc = MI->getOpcode();
- unsigned DstR = MI->getOperand(0).getReg();
+ Register DstR = MI->getOperand(0).getReg();
int FI = MI->getOperand(1).getIndex();
// TmpR = L2_loadri_io FI, 0
- unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
BuildMI(B, It, DL, HII.get(Hexagon::L2_loadri_io), TmpR)
.addFrameIndex(FI)
.addImm(0)
@@ -1658,7 +1658,7 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B,
return false;
DebugLoc DL = MI->getDebugLoc();
- unsigned SrcR = MI->getOperand(2).getReg();
+ Register SrcR = MI->getOperand(2).getReg();
bool IsKill = MI->getOperand(2).isKill();
int FI = MI->getOperand(0).getIndex();
auto *RC = &Hexagon::HvxVRRegClass;
@@ -1667,8 +1667,8 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B,
// TmpR0 = A2_tfrsi 0x01010101
// TmpR1 = V6_vandqrt Qx, TmpR0
// store FI, 0, TmpR1
- unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
- unsigned TmpR1 = MRI.createVirtualRegister(RC);
+ Register TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register TmpR1 = MRI.createVirtualRegister(RC);
BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0)
.addImm(0x01010101);
@@ -1695,15 +1695,15 @@ bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B,
return false;
DebugLoc DL = MI->getDebugLoc();
- unsigned DstR = MI->getOperand(0).getReg();
+ Register DstR = MI->getOperand(0).getReg();
int FI = MI->getOperand(1).getIndex();
auto *RC = &Hexagon::HvxVRRegClass;
// TmpR0 = A2_tfrsi 0x01010101
// TmpR1 = load FI, 0
// DstR = V6_vandvrt TmpR1, TmpR0
- unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
- unsigned TmpR1 = MRI.createVirtualRegister(RC);
+ Register TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register TmpR1 = MRI.createVirtualRegister(RC);
BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0)
.addImm(0x01010101);
@@ -1745,9 +1745,9 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
}
DebugLoc DL = MI->getDebugLoc();
- unsigned SrcR = MI->getOperand(2).getReg();
- unsigned SrcLo = HRI.getSubReg(SrcR, Hexagon::vsub_lo);
- unsigned SrcHi = HRI.getSubReg(SrcR, Hexagon::vsub_hi);
+ Register SrcR = MI->getOperand(2).getReg();
+ Register SrcLo = HRI.getSubReg(SrcR, Hexagon::vsub_lo);
+ Register SrcHi = HRI.getSubReg(SrcR, Hexagon::vsub_hi);
bool IsKill = MI->getOperand(2).isKill();
int FI = MI->getOperand(0).getIndex();
@@ -1793,9 +1793,9 @@ bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B,
return false;
DebugLoc DL = MI->getDebugLoc();
- unsigned DstR = MI->getOperand(0).getReg();
- unsigned DstHi = HRI.getSubReg(DstR, Hexagon::vsub_hi);
- unsigned DstLo = HRI.getSubReg(DstR, Hexagon::vsub_lo);
+ Register DstR = MI->getOperand(0).getReg();
+ Register DstHi = HRI.getSubReg(DstR, Hexagon::vsub_hi);
+ Register DstLo = HRI.getSubReg(DstR, Hexagon::vsub_lo);
int FI = MI->getOperand(1).getIndex();
unsigned Size = HRI.getSpillSize(Hexagon::HvxVRRegClass);
@@ -1834,7 +1834,7 @@ bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B,
auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
DebugLoc DL = MI->getDebugLoc();
- unsigned SrcR = MI->getOperand(2).getReg();
+ Register SrcR = MI->getOperand(2).getReg();
bool IsKill = MI->getOperand(2).isKill();
int FI = MI->getOperand(0).getIndex();
@@ -1863,7 +1863,7 @@ bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B,
auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
DebugLoc DL = MI->getDebugLoc();
- unsigned DstR = MI->getOperand(0).getReg();
+ Register DstR = MI->getOperand(0).getReg();
int FI = MI->getOperand(1).getIndex();
unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
@@ -2299,7 +2299,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
int TFI;
if (!HII.isLoadFromStackSlot(MI, TFI) || TFI != FI)
continue;
- unsigned DstR = MI.getOperand(0).getReg();
+ Register DstR = MI.getOperand(0).getReg();
assert(MI.getOperand(0).getSubReg() == 0);
MachineInstr *CopyOut = nullptr;
if (DstR != FoundR) {
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 65e8c7686640..27265dd53794 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -30,7 +30,7 @@ class TargetRegisterClass;
class HexagonFrameLowering : public TargetFrameLowering {
public:
explicit HexagonFrameLowering()
- : TargetFrameLowering(StackGrowsDown, 8, 0, 1, true) {}
+ : TargetFrameLowering(StackGrowsDown, Align(8), 0, Align::None(), true) {}
// All of the prolog/epilog functionality, including saving and restoring
// callee-saved registers is handled in emitPrologue. This is to have the
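
The constructor change just above and the getLocalFrameMaxAlign hunk in HexagonFrameLowering.cpp both follow from the frame-lowering interfaces now taking llvm::Align values: Align(8) requests 8-byte stack alignment, and Align::None() is simply Align(1), meaning no extra alignment, which is why the "unset" check earlier compares against 1 where the old integer API used 0. A hedged, illustration-only model of that behaviour (AlignSketch is not the LLVM class, just the assumed semantics):

    #include <cassert>
    #include <cstdint>

    // Model of the Align semantics relied on above: a power-of-two byte
    // count stored as its log2, where a value of 1 means "no alignment".
    struct AlignSketch {
      unsigned Log2 = 0;
      explicit AlignSketch(uint64_t V) {
        assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
        while ((UINT64_C(1) << Log2) < V)
          ++Log2;
      }
      uint64_t value() const { return UINT64_C(1) << Log2; }
    };

    int main() {
      assert(AlignSketch(8).value() == 8); // Align(8): 8-byte stack alignment
      assert(AlignSketch(1).value() == 1); // Align::None(): no extra alignment
      return 0;
    }
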
diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp
index 3417c74e359b..caa0e4d80397 100644
--- a/lib/Target/Hexagon/HexagonGenExtract.cpp
+++ b/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -184,7 +184,7 @@ bool HexagonGenExtract::convert(Instruction *In) {
// The width of the extracted field is the minimum of the original bits
// that remain after the shifts and the number of contiguous 1s in the mask.
uint32_t W = std::min(U, T);
- if (W == 0)
+ if (W == 0 || W == 1)
return false;
// Check if the extracted bits are contained within the mask that it is
diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp
index 81025c1c5325..48881e02f4d3 100644
--- a/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -163,11 +163,11 @@ namespace {
}
static inline unsigned v2x(unsigned v) {
- return TargetRegisterInfo::virtReg2Index(v);
+ return Register::virtReg2Index(v);
}
static inline unsigned x2v(unsigned x) {
- return TargetRegisterInfo::index2VirtReg(x);
+ return Register::index2VirtReg(x);
}
};
@@ -267,7 +267,7 @@ namespace {
CellMapShadow(const BitTracker &T) : BT(T) {}
const BitTracker::RegisterCell &lookup(unsigned VR) {
- unsigned RInd = TargetRegisterInfo::virtReg2Index(VR);
+ unsigned RInd = Register::virtReg2Index(VR);
// Grow the vector to at least 32 elements.
if (RInd >= CVect.size())
CVect.resize(std::max(RInd+16, 32U), nullptr);
@@ -606,9 +606,9 @@ void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const {
for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (MO.isReg() && MO.isDef()) {
- unsigned R = MO.getReg();
+ Register R = MO.getReg();
assert(MO.getSubReg() == 0 && "Unexpected subregister in definition");
- if (TargetRegisterInfo::isVirtualRegister(R))
+ if (Register::isVirtualRegister(R))
RO.insert(std::make_pair(R, Index++));
}
}
@@ -724,8 +724,8 @@ void HexagonGenInsert::getInstrDefs(const MachineInstr *MI,
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned R = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = MO.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
Defs.insert(R);
}
@@ -737,8 +737,8 @@ void HexagonGenInsert::getInstrUses(const MachineInstr *MI,
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned R = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = MO.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
Uses.insert(R);
}
@@ -1399,7 +1399,7 @@ bool HexagonGenInsert::generateInserts() {
for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
unsigned VR = I->first;
const TargetRegisterClass *RC = MRI->getRegClass(VR);
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
RegMap[VR] = NewVR;
}
@@ -1477,9 +1477,8 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned R = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R) ||
- !MRI->use_nodbg_empty(R)) {
+ Register R = MO.getReg();
+ if (!Register::isVirtualRegister(R) || !MRI->use_nodbg_empty(R)) {
AllDead = false;
break;
}
@@ -1598,7 +1597,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
IterListType Out;
for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
- unsigned Idx = TargetRegisterInfo::virtReg2Index(I->first);
+ unsigned Idx = Register::virtReg2Index(I->first);
if (Idx >= Cutoff)
Out.push_back(I);
}
diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp
index cdafbc20ab86..b559e7bbbb60 100644
--- a/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -171,7 +171,7 @@ void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs,
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || MO.isImplicit())
continue;
- unsigned R = MO.getReg();
+ Register R = MO.getReg();
BitVector &Set = MO.isDef() ? Defs : Uses;
expandReg(R, Set);
}
@@ -239,14 +239,14 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
unsigned Opc = MI->getOpcode();
if (!isCondTransfer(Opc))
continue;
- unsigned DR = MI->getOperand(0).getReg();
+ Register DR = MI->getOperand(0).getReg();
if (isRegPair(DR))
continue;
MachineOperand &PredOp = MI->getOperand(1);
if (PredOp.isUndef())
continue;
- unsigned PR = PredOp.getReg();
+ Register PR = PredOp.getReg();
unsigned Idx = I2X.lookup(MI);
CondsetMap::iterator F = CM.find(DR);
bool IfTrue = HII->isPredicatedTrue(Opc);
diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp
index e991fa8b61c8..24d33c91a29b 100644
--- a/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -133,7 +133,7 @@ INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred",
"Hexagon generate predicate operations", false, false)
bool HexagonGenPredicate::isPredReg(unsigned R) {
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ if (!Register::isVirtualRegister(R))
return false;
const TargetRegisterClass *RC = MRI->getRegClass(R);
return RC == &Hexagon::PredRegsRegClass;
@@ -213,7 +213,7 @@ void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) {
case TargetOpcode::COPY:
if (isPredReg(MI->getOperand(1).getReg())) {
RegisterSubReg RD = MI->getOperand(0);
- if (TargetRegisterInfo::isVirtualRegister(RD.R))
+ if (Register::isVirtualRegister(RD.R))
PredGPRs.insert(RD);
}
break;
@@ -245,7 +245,7 @@ RegisterSubReg HexagonGenPredicate::getPredRegFor(const RegisterSubReg &Reg) {
// Create a predicate register for a given Reg. The newly created register
// will have its value copied from Reg, so that it can be later used as
// an operand in other instructions.
- assert(TargetRegisterInfo::isVirtualRegister(Reg.R));
+ assert(Register::isVirtualRegister(Reg.R));
RegToRegMap::iterator F = G2P.find(Reg);
if (F != G2P.end())
return F->second;
@@ -265,7 +265,7 @@ RegisterSubReg HexagonGenPredicate::getPredRegFor(const RegisterSubReg &Reg) {
MachineBasicBlock &B = *DefI->getParent();
DebugLoc DL = DefI->getDebugLoc();
const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
- unsigned NewPR = MRI->createVirtualRegister(PredRC);
+ Register NewPR = MRI->createVirtualRegister(PredRC);
// For convertible instructions, do not modify them, so that they can
// be converted later. Generate a copy from Reg to NewPR.
@@ -432,7 +432,7 @@ bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
// Generate a copy-out: NewGPR = NewPR, and replace all uses of OutR
// with NewGPR.
const TargetRegisterClass *RC = MRI->getRegClass(OutR.R);
- unsigned NewOutR = MRI->createVirtualRegister(RC);
+ Register NewOutR = MRI->createVirtualRegister(RC);
BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), NewOutR)
.addReg(NewPR.R, 0, NewPR.S);
MRI->replaceRegWith(OutR.R, NewOutR);
@@ -471,9 +471,9 @@ bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
continue;
RegisterSubReg DR = MI.getOperand(0);
RegisterSubReg SR = MI.getOperand(1);
- if (!TargetRegisterInfo::isVirtualRegister(DR.R))
+ if (!Register::isVirtualRegister(DR.R))
continue;
- if (!TargetRegisterInfo::isVirtualRegister(SR.R))
+ if (!Register::isVirtualRegister(SR.R))
continue;
if (MRI->getRegClass(DR.R) != PredRC)
continue;
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index cecbaedb6d70..62291790f0fe 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -435,17 +435,17 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
if (Phi->getOperand(i+1).getMBB() != Latch)
continue;
- unsigned PhiOpReg = Phi->getOperand(i).getReg();
+ Register PhiOpReg = Phi->getOperand(i).getReg();
MachineInstr *DI = MRI->getVRegDef(PhiOpReg);
if (DI->getDesc().isAdd()) {
// If the register operand to the add is the PHI we're looking at, this
// meets the induction pattern.
- unsigned IndReg = DI->getOperand(1).getReg();
+ Register IndReg = DI->getOperand(1).getReg();
MachineOperand &Opnd2 = DI->getOperand(2);
int64_t V;
if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(Opnd2, V)) {
- unsigned UpdReg = DI->getOperand(0).getReg();
+ Register UpdReg = DI->getOperand(0).getReg();
IndMap.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V)));
}
}
@@ -694,7 +694,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
Cmp = Comparison::getSwappedComparison(Cmp);
if (InitialValue->isReg()) {
- unsigned R = InitialValue->getReg();
+ Register R = InitialValue->getReg();
MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
if (!MDT->properlyDominates(DefBB, Header)) {
int64_t V;
@@ -704,7 +704,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
OldInsts.push_back(MRI->getVRegDef(R));
}
if (EndValue->isReg()) {
- unsigned R = EndValue->getReg();
+ Register R = EndValue->getReg();
MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
if (!MDT->properlyDominates(DefBB, Header)) {
int64_t V;
@@ -910,7 +910,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
(RegToImm ? TII->get(Hexagon::A2_subri) :
TII->get(Hexagon::A2_addi));
if (RegToReg || RegToImm) {
- unsigned SubR = MRI->createVirtualRegister(IntRC);
+ Register SubR = MRI->createVirtualRegister(IntRC);
MachineInstrBuilder SubIB =
BuildMI(*PH, InsertPos, DL, SubD, SubR);
@@ -931,7 +931,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
EndValInstr->getOperand(2).getImm() == StartV) {
DistR = EndValInstr->getOperand(1).getReg();
} else {
- unsigned SubR = MRI->createVirtualRegister(IntRC);
+ Register SubR = MRI->createVirtualRegister(IntRC);
MachineInstrBuilder SubIB =
BuildMI(*PH, InsertPos, DL, SubD, SubR);
SubIB.addReg(End->getReg(), 0, End->getSubReg())
@@ -950,7 +950,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
AdjSR = DistSR;
} else {
// Generate CountR = ADD DistR, AdjVal
- unsigned AddR = MRI->createVirtualRegister(IntRC);
+ Register AddR = MRI->createVirtualRegister(IntRC);
MCInstrDesc const &AddD = TII->get(Hexagon::A2_addi);
BuildMI(*PH, InsertPos, DL, AddD, AddR)
.addReg(DistR, 0, DistSR)
@@ -971,7 +971,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
unsigned Shift = Log2_32(IVBump);
// Generate NormR = LSR DistR, Shift.
- unsigned LsrR = MRI->createVirtualRegister(IntRC);
+ Register LsrR = MRI->createVirtualRegister(IntRC);
const MCInstrDesc &LsrD = TII->get(Hexagon::S2_lsr_i_r);
BuildMI(*PH, InsertPos, DL, LsrD, LsrR)
.addReg(AdjR, 0, AdjSR)
@@ -1038,7 +1038,7 @@ bool HexagonHardwareLoops::isDead(const MachineInstr *MI,
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (MRI->use_nodbg_empty(Reg))
continue;
@@ -1058,7 +1058,7 @@ bool HexagonHardwareLoops::isDead(const MachineInstr *MI,
if (!OPO.isReg() || !OPO.isDef())
continue;
- unsigned OPReg = OPO.getReg();
+ Register OPReg = OPO.getReg();
use_nodbg_iterator nextJ;
for (use_nodbg_iterator J = MRI->use_nodbg_begin(OPReg);
J != End; J = nextJ) {
@@ -1092,7 +1092,7 @@ void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
MachineRegisterInfo::use_iterator nextI;
for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
E = MRI->use_end(); I != E; I = nextI) {
@@ -1244,7 +1244,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L,
if (TripCount->isReg()) {
// Create a copy of the loop count register.
- unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg)
.addReg(TripCount->getReg(), 0, TripCount->getSubReg());
// Add the Loop instruction to the beginning of the loop.
@@ -1257,7 +1257,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L,
// create a new virtual register.
int64_t CountImm = TripCount->getImm();
if (!TII->isValidOffset(LOOP_i, CountImm, TRI)) {
- unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::A2_tfrsi), CountReg)
.addImm(CountImm);
BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_r))
@@ -1333,7 +1333,7 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI,
return true;
// Out of order.
- unsigned PredR = CmpI->getOperand(0).getReg();
+ Register PredR = CmpI->getOperand(0).getReg();
bool FoundBump = false;
instr_iterator CmpIt = CmpI->getIterator(), NextIt = std::next(CmpIt);
for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) {
@@ -1428,10 +1428,10 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow(
if (checkForImmediate(*InitVal, Imm))
return (EndVal->getImm() == Imm);
- unsigned Reg = InitVal->getReg();
+ Register Reg = InitVal->getReg();
// We don't know the value of a physical register.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return true;
MachineInstr *Def = MRI->getVRegDef(Reg);
@@ -1508,8 +1508,8 @@ bool HexagonHardwareLoops::checkForImmediate(const MachineOperand &MO,
// processed to handle potential subregisters in MO.
int64_t TV;
- unsigned R = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = MO.getReg();
+ if (!Register::isVirtualRegister(R))
return false;
MachineInstr *DI = MRI->getVRegDef(R);
unsigned DOpc = DI->getOpcode();
@@ -1582,11 +1582,11 @@ void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) {
}
assert(MO.isReg());
- unsigned R = MO.getReg();
+ Register R = MO.getReg();
MachineInstr *DI = MRI->getVRegDef(R);
const TargetRegisterClass *RC = MRI->getRegClass(R);
- unsigned NewR = MRI->createVirtualRegister(RC);
+ Register NewR = MRI->createVirtualRegister(RC);
MachineBasicBlock &B = *DI->getParent();
DebugLoc DL = DI->getDebugLoc();
BuildMI(B, DI, DL, TII->get(DI->getOpcode()), NewR).addImm(Val);
@@ -1634,17 +1634,17 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
if (Phi->getOperand(i+1).getMBB() != Latch)
continue;
- unsigned PhiReg = Phi->getOperand(i).getReg();
+ Register PhiReg = Phi->getOperand(i).getReg();
MachineInstr *DI = MRI->getVRegDef(PhiReg);
if (DI->getDesc().isAdd()) {
// If the register operand to the add/sub is the PHI we are looking
// at, this meets the induction pattern.
- unsigned IndReg = DI->getOperand(1).getReg();
+ Register IndReg = DI->getOperand(1).getReg();
MachineOperand &Opnd2 = DI->getOperand(2);
int64_t V;
if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(Opnd2, V)) {
- unsigned UpdReg = DI->getOperand(0).getReg();
+ Register UpdReg = DI->getOperand(0).getReg();
IndRegs.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V)));
}
}
@@ -1702,7 +1702,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
if (!Cond[CSz-1].isReg())
return false;
- unsigned P = Cond[CSz-1].getReg();
+ Register P = Cond[CSz - 1].getReg();
MachineInstr *PredDef = MRI->getVRegDef(P);
if (!PredDef->isCompare())
@@ -1903,15 +1903,15 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
MachineInstr *NewPN = MF->CreateMachineInstr(PD, DL);
NewPH->insert(NewPH->end(), NewPN);
- unsigned PR = PN->getOperand(0).getReg();
+ Register PR = PN->getOperand(0).getReg();
const TargetRegisterClass *RC = MRI->getRegClass(PR);
- unsigned NewPR = MRI->createVirtualRegister(RC);
+ Register NewPR = MRI->createVirtualRegister(RC);
NewPN->addOperand(MachineOperand::CreateReg(NewPR, true));
// Copy all non-latch operands of a header's PHI node to the newly
// created PHI node in the preheader.
for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) {
- unsigned PredR = PN->getOperand(i).getReg();
+ Register PredR = PN->getOperand(i).getReg();
unsigned PredRSub = PN->getOperand(i).getSubReg();
MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB();
if (PredB == Latch)
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 605fcfc25559..4684d8e4781a 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -697,7 +697,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
//
void HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
SDLoc dl(N);
- ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+ auto *CN = cast<ConstantFPSDNode>(N);
APInt A = CN->getValueAPF().bitcastToAPInt();
if (N->getValueType(0) == MVT::f32) {
SDValue V = CurDAG->getTargetConstant(A.getZExtValue(), dl, MVT::i32);
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index fef5a98cdb00..8a8986e232a0 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -240,12 +240,12 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return true;
}
-unsigned HexagonTargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
+Register HexagonTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &) const {
// Just support r19, the linux kernel uses it.
- unsigned Reg = StringSwitch<unsigned>(RegName)
+ Register Reg = StringSwitch<Register>(RegName)
.Case("r19", Hexagon::R19)
- .Default(0);
+ .Default(Register());
if (Reg)
return Reg;
@@ -286,7 +286,7 @@ SDValue HexagonTargetLowering::LowerCallResult(
SDValue FR0 = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
MVT::i32, Glue);
// FR0 = (Value, Chain, Glue)
- unsigned PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ Register PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
SDValue TPR = DAG.getCopyToReg(FR0.getValue(1), dl, PredR,
FR0.getValue(0), FR0.getValue(2));
// TPR = (Chain, Glue)
@@ -736,7 +736,7 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
RegVT = VA.getValVT();
const TargetRegisterClass *RC = getRegClassFor(RegVT);
- unsigned VReg = MRI.createVirtualRegister(RC);
+ Register VReg = MRI.createVirtualRegister(RC);
SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
// Treat values of type MVT::i1 specially: they are passed in
@@ -870,15 +870,20 @@ SDValue
HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue PredOp = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1), Op2 = Op.getOperand(2);
- EVT OpVT = Op1.getValueType();
- SDLoc DL(Op);
+ MVT OpTy = ty(Op1);
+ const SDLoc &dl(Op);
- if (OpVT == MVT::v2i16) {
- SDValue X1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op1);
- SDValue X2 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op2);
- SDValue SL = DAG.getNode(ISD::VSELECT, DL, MVT::v2i32, PredOp, X1, X2);
- SDValue TR = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i16, SL);
- return TR;
+ if (OpTy == MVT::v2i16 || OpTy == MVT::v4i8) {
+ MVT ElemTy = OpTy.getVectorElementType();
+ assert(ElemTy.isScalarInteger());
+ MVT WideTy = MVT::getVectorVT(MVT::getIntegerVT(2*ElemTy.getSizeInBits()),
+ OpTy.getVectorNumElements());
+ // Generate (trunc (select (_, sext, sext))).
+ return DAG.getSExtOrTrunc(
+ DAG.getSelect(dl, WideTy, PredOp,
+ DAG.getSExtOrTrunc(Op1, dl, WideTy),
+ DAG.getSExtOrTrunc(Op2, dl, WideTy)),
+ dl, OpTy);
}
return SDValue();
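
The LowerVSELECT rewrite above works because an element-wise select commutes with sign-extension, and truncating afterwards undoes the widening. A minimal standalone sketch of the same widen/select/truncate trick for two i16 lanes (plain C++; the values and helper name are illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Emulate (trunc (vselect pred, (sext a), (sext b))) for two i16 lanes,
    // mirroring the widening done in LowerVSELECT.
    static void select_v2i16(const bool pred[2], const int16_t a[2],
                             const int16_t b[2], int16_t out[2]) {
      for (int i = 0; i < 2; ++i) {
        int32_t wa = a[i], wb = b[i];        // sign-extend each lane to i32
        int32_t sel = pred[i] ? wa : wb;     // select in the widened type
        out[i] = static_cast<int16_t>(sel);  // truncate back to i16
      }
    }

    int main() {
      const bool pred[2] = {true, false};
      const int16_t a[2] = {-5, 100};
      const int16_t b[2] = {7, -8};
      int16_t out[2];
      select_v2i16(pred, a, b, out);
      assert(out[0] == -5 && out[1] == -8);  // identical to a direct lane select
      return 0;
    }
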
@@ -1230,9 +1235,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
Subtarget(ST) {
auto &HRI = *Subtarget.getRegisterInfo();
- setPrefLoopAlignment(4);
- setPrefFunctionAlignment(4);
- setMinFunctionAlignment(2);
+ setPrefLoopAlignment(Align(16));
+ setMinFunctionAlignment(Align(4));
+ setPrefFunctionAlignment(Align(16));
setStackPointerRegisterToSaveRestore(HRI.getStackRegister());
setBooleanContents(TargetLoweringBase::UndefinedBooleanContent);
setBooleanVectorContents(TargetLoweringBase::UndefinedBooleanContent);
@@ -1434,12 +1439,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
ISD::CONCAT_VECTORS, ISD::VECTOR_SHUFFLE
};
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
for (unsigned VectExpOp : VectExpOps)
setOperationAction(VectExpOp, VT, Expand);
// Expand all extending loads and truncating stores:
- for (MVT TargetVT : MVT::vector_valuetypes()) {
+ for (MVT TargetVT : MVT::fixedlen_vector_valuetypes()) {
if (TargetVT == VT)
continue;
setLoadExtAction(ISD::EXTLOAD, TargetVT, VT, Expand);
@@ -1496,16 +1501,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, VT, Custom);
}
- for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v2i32, MVT::v4i16, MVT::v2i32}) {
- setCondCodeAction(ISD::SETLT, VT, Expand);
+ for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v8i8, MVT::v2i32, MVT::v4i16,
+ MVT::v2i32}) {
+ setCondCodeAction(ISD::SETNE, VT, Expand);
setCondCodeAction(ISD::SETLE, VT, Expand);
- setCondCodeAction(ISD::SETULT, VT, Expand);
+ setCondCodeAction(ISD::SETGE, VT, Expand);
+ setCondCodeAction(ISD::SETLT, VT, Expand);
setCondCodeAction(ISD::SETULE, VT, Expand);
+ setCondCodeAction(ISD::SETUGE, VT, Expand);
+ setCondCodeAction(ISD::SETULT, VT, Expand);
}
// Custom-lower bitcasts from i8 to v8i1.
setOperationAction(ISD::BITCAST, MVT::i8, Custom);
setOperationAction(ISD::SETCC, MVT::v2i16, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4i8, Custom);
setOperationAction(ISD::VSELECT, MVT::v2i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
@@ -1554,6 +1564,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f64, Legal);
}
+ setTargetDAGCombine(ISD::VSELECT);
+
if (Subtarget.useHVXOps())
initializeHVXLowering();
@@ -1643,6 +1655,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VINSERTW0: return "HexagonISD::VINSERTW0";
case HexagonISD::VROR: return "HexagonISD::VROR";
case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE";
+ case HexagonISD::PTRUE: return "HexagonISD::PTRUE";
+ case HexagonISD::PFALSE: return "HexagonISD::PFALSE";
case HexagonISD::VZERO: return "HexagonISD::VZERO";
case HexagonISD::VSPLATW: return "HexagonISD::VSPLATW";
case HexagonISD::D2P: return "HexagonISD::D2P";
@@ -1783,7 +1797,8 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// The offset value comes through Modifier register. For now, assume the
// offset is 0.
Info.offset = 0;
- Info.align = DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont));
+ Info.align =
+ MaybeAlign(DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont)));
Info.flags = MachineMemOperand::MOLoad;
return true;
}
@@ -1805,7 +1820,8 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(VecTy);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = M.getDataLayout().getTypeAllocSizeInBits(VecTy) / 8;
+ Info.align =
+ MaybeAlign(M.getDataLayout().getTypeAllocSizeInBits(VecTy) / 8);
Info.flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile;
@@ -1817,6 +1833,10 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
+bool HexagonTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
+ return X.getValueType().isScalarInteger(); // 'tstbit'
+}
+
bool HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
return isTruncateFree(EVT::getEVT(Ty1), EVT::getEVT(Ty2));
}
@@ -1844,26 +1864,33 @@ bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask,
TargetLoweringBase::LegalizeTypeAction
HexagonTargetLowering::getPreferredVectorAction(MVT VT) const {
- if (VT.getVectorNumElements() == 1)
- return TargetLoweringBase::TypeScalarizeVector;
-
- // Always widen vectors of i1.
+ unsigned VecLen = VT.getVectorNumElements();
MVT ElemTy = VT.getVectorElementType();
- if (ElemTy == MVT::i1)
- return TargetLoweringBase::TypeWidenVector;
+
+ if (VecLen == 1 || VT.isScalableVector())
+ return TargetLoweringBase::TypeScalarizeVector;
if (Subtarget.useHVXOps()) {
+ unsigned HwLen = Subtarget.getVectorLength();
// If the size of VT is at least half of the vector length,
// widen the vector. Note: the threshold was not selected in
// any scientific way.
ArrayRef<MVT> Tys = Subtarget.getHVXElementTypes();
if (llvm::find(Tys, ElemTy) != Tys.end()) {
- unsigned HwWidth = 8*Subtarget.getVectorLength();
+ unsigned HwWidth = 8*HwLen;
unsigned VecWidth = VT.getSizeInBits();
if (VecWidth >= HwWidth/2 && VecWidth < HwWidth)
return TargetLoweringBase::TypeWidenVector;
}
+ // Split vectors of i1 that correspond to (byte) vector pairs.
+ if (ElemTy == MVT::i1 && VecLen == 2*HwLen)
+ return TargetLoweringBase::TypeSplitVector;
}
+
+ // Always widen (remaining) vectors of i1.
+ if (ElemTy == MVT::i1)
+ return TargetLoweringBase::TypeWidenVector;
+
return TargetLoweringBase::TypeSplitVector;
}
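
The widening heuristic above reduces to: widen a vector that is at least half a hardware vector wide but not yet a full vector, otherwise fall through to splitting. A small standalone sketch of just that threshold test (plain C++; the 128-byte HVX length and the sample widths are assumptions for illustration):

    #include <cassert>

    // Mirror of the threshold test in getPreferredVectorAction:
    // widen when HwWidth/2 <= VecWidth < HwWidth (HwWidth = 8 * vector length).
    static bool shouldWiden(unsigned VecWidthBits, unsigned HwLenBytes) {
      unsigned HwWidth = 8 * HwLenBytes;
      return VecWidthBits >= HwWidth / 2 && VecWidthBits < HwWidth;
    }

    int main() {
      const unsigned HwLen = 128;         // assumed 128-byte HVX mode
      assert(shouldWiden(512, HwLen));    // half a vector wide -> widen
      assert(!shouldWiden(256, HwLen));   // below the threshold -> split instead
      assert(!shouldWiden(1024, HwLen));  // already a full vector
      return 0;
    }
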
@@ -2452,6 +2479,23 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return buildVector64(Ops, dl, VecTy, DAG);
if (VecTy == MVT::v8i1 || VecTy == MVT::v4i1 || VecTy == MVT::v2i1) {
+ // Check if this is a special case of all-0 or all-1.
+ bool All0 = true, All1 = true;
+ for (SDValue P : Ops) {
+ auto *CN = dyn_cast<ConstantSDNode>(P.getNode());
+ if (CN == nullptr) {
+ All0 = All1 = false;
+ break;
+ }
+ uint32_t C = CN->getZExtValue();
+ All0 &= (C == 0);
+ All1 &= (C == 1);
+ }
+ if (All0)
+ return DAG.getNode(HexagonISD::PFALSE, dl, VecTy);
+ if (All1)
+ return DAG.getNode(HexagonISD::PTRUE, dl, VecTy);
+
// For each i1 element in the resulting predicate register, put 1
// shifted by the index of the element into a general-purpose register,
// then or them together and transfer it back into a predicate register.
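
The comment above describes building the predicate by OR-ing together 1 shifted by each element index. A scalar model of that mask construction (plain C++; the sample elements are illustrative, and the final transfer into a predicate register is only noted in a comment):

    #include <cassert>
    #include <cstdint>

    // Scalar model of the predicate build described above: set bit i of a
    // GPR-sized mask for each non-zero i1 element; the real code then
    // transfers the mask into a predicate register.
    static uint32_t buildPredMask(const bool *Elems, unsigned N) {
      uint32_t Mask = 0;
      for (unsigned i = 0; i < N; ++i)
        Mask |= uint32_t(Elems[i]) << i;  // 1 shifted by the element index
      return Mask;
    }

    int main() {
      const bool Elems[4] = {true, false, true, true};
      assert(buildPredMask(Elems, 4) == 0xDu);  // bits 0, 2 and 3 set
      return 0;
    }
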
@@ -2629,7 +2673,8 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
DoDefault = true;
if (!AlignLoads) {
- if (allowsMemoryAccess(Ctx, DL, LN->getMemoryVT(), *LN->getMemOperand()))
+ if (allowsMemoryAccessForAlignment(Ctx, DL, LN->getMemoryVT(),
+ *LN->getMemOperand()))
return Op;
DoDefault = true;
}
@@ -2637,7 +2682,8 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
// The PartTy is the equivalent of "getLoadableTypeOfSize(HaveAlign)".
MVT PartTy = HaveAlign <= 8 ? MVT::getIntegerVT(8 * HaveAlign)
: MVT::getVectorVT(MVT::i8, HaveAlign);
- DoDefault = allowsMemoryAccess(Ctx, DL, PartTy, *LN->getMemOperand());
+ DoDefault =
+ allowsMemoryAccessForAlignment(Ctx, DL, PartTy, *LN->getMemOperand());
}
if (DoDefault) {
std::pair<SDValue, SDValue> P = expandUnalignedLoad(LN, DAG);
@@ -2865,12 +2911,54 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
if (N->getValueType(0) == MVT::i8) {
SDValue P = getInstr(Hexagon::C2_tfrpr, dl, MVT::i32,
N->getOperand(0), DAG);
- Results.push_back(P);
+ SDValue T = DAG.getAnyExtOrTrunc(P, dl, MVT::i8);
+ Results.push_back(T);
}
break;
}
}
+SDValue
+HexagonTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
+ const {
+ SDValue Op(N, 0);
+ if (isHvxOperation(Op)) {
+ if (SDValue V = PerformHvxDAGCombine(N, DCI))
+ return V;
+ return SDValue();
+ }
+
+ const SDLoc &dl(Op);
+ unsigned Opc = Op.getOpcode();
+
+ if (Opc == HexagonISD::P2D) {
+ SDValue P = Op.getOperand(0);
+ switch (P.getOpcode()) {
+ case HexagonISD::PTRUE:
+ return DCI.DAG.getConstant(-1, dl, ty(Op));
+ case HexagonISD::PFALSE:
+ return getZero(dl, ty(Op), DCI.DAG);
+ default:
+ break;
+ }
+ } else if (Opc == ISD::VSELECT) {
+ // This is pretty much duplicated in HexagonISelLoweringHVX...
+ //
+ // (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0)
+ SDValue Cond = Op.getOperand(0);
+ if (Cond->getOpcode() == ISD::XOR) {
+ SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
+ if (C1->getOpcode() == HexagonISD::PTRUE) {
+ SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0,
+ Op.getOperand(2), Op.getOperand(1));
+ return VSel;
+ }
+ }
+ }
+
+ return SDValue();
+}
+
/// Returns relocation base for the given PIC jumptable.
SDValue
HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table,
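
The VSELECT combine added in this file folds a predicate negated via xor-with-ptrue by swapping the two select operands. A tiny standalone check of the per-lane identity select(!p, a, b) == select(p, b, a) that the fold relies on (plain C++, illustrative only):

    #include <cassert>

    // Per-lane justification for (vselect (xor x, ptrue), v0, v1)
    //   -> (vselect x, v1, v0):
    // xor with an all-true predicate negates each lane, and a negated lane
    // simply picks the other operand.
    static int sel(bool p, int a, int b) { return p ? a : b; }

    int main() {
      assert(sel(!true, 1, 2) == sel(true, 2, 1));
      assert(sel(!false, 1, 2) == sel(false, 2, 1));
      return 0;
    }
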
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 4e467cb22727..75f553bfec7f 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -68,6 +68,8 @@ namespace HexagonISD {
EH_RETURN,
DCFETCH,
READCYCLE,
+ PTRUE,
+ PFALSE,
D2P, // Convert 8-byte value to 8-bit predicate register. [*]
P2D, // Convert 8-bit predicate register to 8-byte value. [*]
V2Q, // Convert HVX vector to a vector predicate reg. [*]
@@ -127,6 +129,8 @@ namespace HexagonISD {
bool isCheapToSpeculateCtlz() const override { return true; }
bool isCtlzFast() const override { return true; }
+ bool hasBitTest(SDValue X, SDValue Y) const override;
+
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
/// Return true if an FMA operation is faster than a pair of mul and add
@@ -221,10 +225,12 @@ namespace HexagonISD {
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
@@ -299,7 +305,8 @@ namespace HexagonISD {
const AttributeList &FuncAttributes) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
- unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const override;
+ unsigned Align, MachineMemOperand::Flags Flags, bool *Fast)
+ const override;
/// Returns relocation base for the given PIC jumptable.
SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
@@ -456,6 +463,8 @@ namespace HexagonISD {
bool isHvxOperation(SDValue Op) const;
SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 345c657787a0..bc8a9959c917 100644
--- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -193,6 +193,8 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::OR, BoolV, Legal);
setOperationAction(ISD::XOR, BoolV, Legal);
}
+
+ setTargetDAGCombine(ISD::VSELECT);
}
SDValue
@@ -1580,6 +1582,28 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Unhandled HVX operation");
}
+SDValue
+HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
+ const {
+ const SDLoc &dl(N);
+ SDValue Op(N, 0);
+
+ unsigned Opc = Op.getOpcode();
+ if (Opc == ISD::VSELECT) {
+ // (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0)
+ SDValue Cond = Op.getOperand(0);
+ if (Cond->getOpcode() == ISD::XOR) {
+ SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
+ if (C1->getOpcode() == HexagonISD::QTRUE) {
+ SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0,
+ Op.getOperand(2), Op.getOperand(1));
+ return VSel;
+ }
+ }
+ }
+ return SDValue();
+}
+
bool
HexagonTargetLowering::isHvxOperation(SDValue Op) const {
// If the type of the result, or any operand type are HVX vector types,
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index a156de5ba128..767538f92ed6 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -193,7 +193,7 @@ static inline void parseOperands(const MachineInstr &MI,
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
@@ -674,86 +674,96 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
return 2;
}
-/// Analyze the loop code to find the loop induction variable and compare used
-/// to compute the number of iterations. Currently, we analyze loop that are
-/// controlled using hardware loops. In this case, the induction variable
-/// instruction is null. For all other cases, this function returns true, which
-/// means we're unable to analyze it.
-bool HexagonInstrInfo::analyzeLoop(MachineLoop &L,
- MachineInstr *&IndVarInst,
- MachineInstr *&CmpInst) const {
+namespace {
+class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+ MachineInstr *Loop, *EndLoop;
+ MachineFunction *MF;
+ const HexagonInstrInfo *TII;
+ int64_t TripCount;
+ Register LoopCount;
+ DebugLoc DL;
- MachineBasicBlock *LoopEnd = L.getBottomBlock();
- MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
- // We really "analyze" only hardware loops right now.
- if (I != LoopEnd->end() && isEndLoopN(I->getOpcode())) {
- IndVarInst = nullptr;
- CmpInst = &*I;
- return false;
+public:
+ HexagonPipelinerLoopInfo(MachineInstr *Loop, MachineInstr *EndLoop)
+ : Loop(Loop), EndLoop(EndLoop), MF(Loop->getParent()->getParent()),
+ TII(MF->getSubtarget<HexagonSubtarget>().getInstrInfo()),
+ DL(Loop->getDebugLoc()) {
+ // Inspect the Loop instruction up-front, as it may be deleted when we call
+ // createTripCountGreaterCondition.
+ TripCount = Loop->getOpcode() == Hexagon::J2_loop0r
+ ? -1
+ : Loop->getOperand(1).getImm();
+ if (TripCount == -1)
+ LoopCount = Loop->getOperand(1).getReg();
}
- return true;
-}
-/// Generate code to reduce the loop iteration by one and check if the loop is
-/// finished. Return the value/register of the new loop count. this function
-/// assumes the nth iteration is peeled first.
-unsigned HexagonInstrInfo::reduceLoopCount(
- MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
- MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
- SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
- unsigned MaxIter) const {
- // We expect a hardware loop currently. This means that IndVar is set
- // to null, and the compare is the ENDLOOP instruction.
- assert((!IndVar) && isEndLoopN(Cmp.getOpcode())
- && "Expecting a hardware loop");
- MachineFunction *MF = MBB.getParent();
- DebugLoc DL = Cmp.getDebugLoc();
- SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
- MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(),
- Cmp.getOperand(0).getMBB(), VisitedBBs);
- if (!Loop)
- return 0;
- // If the loop trip count is a compile-time value, then just change the
- // value.
- if (Loop->getOpcode() == Hexagon::J2_loop0i ||
- Loop->getOpcode() == Hexagon::J2_loop1i) {
- int64_t Offset = Loop->getOperand(1).getImm();
- if (Offset <= 1)
- Loop->eraseFromParent();
- else
- Loop->getOperand(1).setImm(Offset - 1);
- return Offset - 1;
+ bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+ // Only ignore the terminator.
+ return MI == EndLoop;
}
- // The loop trip count is a run-time value. We generate code to subtract
- // one from the trip count, and update the loop instruction.
- assert(Loop->getOpcode() == Hexagon::J2_loop0r && "Unexpected instruction");
- unsigned LoopCount = Loop->getOperand(1).getReg();
- // Check if we're done with the loop.
- unsigned LoopEnd = createVR(MF, MVT::i1);
- MachineInstr *NewCmp = BuildMI(&MBB, DL, get(Hexagon::C2_cmpgtui), LoopEnd).
- addReg(LoopCount).addImm(1);
- unsigned NewLoopCount = createVR(MF, MVT::i32);
- MachineInstr *NewAdd = BuildMI(&MBB, DL, get(Hexagon::A2_addi), NewLoopCount).
- addReg(LoopCount).addImm(-1);
- const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
- // Update the previously generated instructions with the new loop counter.
- for (SmallVectorImpl<MachineInstr *>::iterator I = PrevInsts.begin(),
- E = PrevInsts.end(); I != E; ++I)
- (*I)->substituteRegister(LoopCount, NewLoopCount, 0, HRI);
- PrevInsts.clear();
- PrevInsts.push_back(NewCmp);
- PrevInsts.push_back(NewAdd);
- // Insert the new loop instruction if this is the last time the loop is
- // decremented.
- if (Iter == MaxIter)
- BuildMI(&MBB, DL, get(Hexagon::J2_loop0r)).
- addMBB(Loop->getOperand(0).getMBB()).addReg(NewLoopCount);
- // Delete the old loop instruction.
- if (Iter == 0)
- Loop->eraseFromParent();
- Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf));
- Cond.push_back(NewCmp->getOperand(0));
- return NewLoopCount;
+
+ Optional<bool>
+ createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond) override {
+ if (TripCount == -1) {
+ // Check if we're done with the loop.
+ unsigned Done = TII->createVR(MF, MVT::i1);
+ MachineInstr *NewCmp = BuildMI(&MBB, DL,
+ TII->get(Hexagon::C2_cmpgtui), Done)
+ .addReg(LoopCount)
+ .addImm(TC);
+ Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf));
+ Cond.push_back(NewCmp->getOperand(0));
+ return {};
+ }
+
+ return TripCount > TC;
+ }
+
+ void setPreheader(MachineBasicBlock *NewPreheader) override {
+ NewPreheader->splice(NewPreheader->getFirstTerminator(), Loop->getParent(),
+ Loop);
+ }
+
+ void adjustTripCount(int TripCountAdjust) override {
+ // If the loop trip count is a compile-time value, then just change the
+ // value.
+ if (Loop->getOpcode() == Hexagon::J2_loop0i ||
+ Loop->getOpcode() == Hexagon::J2_loop1i) {
+ int64_t TripCount = Loop->getOperand(1).getImm() + TripCountAdjust;
+ assert(TripCount > 0 && "Can't create an empty or negative loop!");
+ Loop->getOperand(1).setImm(TripCount);
+ return;
+ }
+
+ // The loop trip count is a run-time value. We generate code to subtract
+ // one from the trip count, and update the loop instruction.
+ Register LoopCount = Loop->getOperand(1).getReg();
+ Register NewLoopCount = TII->createVR(MF, MVT::i32);
+ BuildMI(*Loop->getParent(), Loop, Loop->getDebugLoc(),
+ TII->get(Hexagon::A2_addi), NewLoopCount)
+ .addReg(LoopCount)
+ .addImm(TripCountAdjust);
+ Loop->getOperand(1).setReg(NewLoopCount);
+ }
+
+ void disposed() override { Loop->eraseFromParent(); }
+};
+} // namespace
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+HexagonInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+ // We really "analyze" only hardware loops right now.
+ MachineBasicBlock::iterator I = LoopBB->getFirstTerminator();
+
+ if (I != LoopBB->end() && isEndLoopN(I->getOpcode())) {
+ SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+ MachineInstr *LoopInst = findLoopInstr(
+ LoopBB, I->getOpcode(), I->getOperand(0).getMBB(), VisitedBBs);
+ if (LoopInst)
+ return std::make_unique<HexagonPipelinerLoopInfo>(LoopInst, &*I);
+ }
+ return nullptr;
}
bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
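
The analyzeLoop/reduceLoopCount replacement above follows a common refactoring: a stateless query-plus-mutate pair becomes a factory that returns a per-loop state object whose adjustments are virtual methods. A generic sketch of that shape (plain C++; all names are hypothetical and deliberately simpler than the LLVM interface, which lives in TargetInstrInfo):

    #include <memory>

    // Generic shape of the refactoring: analysis returns an object that owns
    // the per-loop state, instead of a single call that both analyzes and
    // mutates. All names here are hypothetical.
    struct LoopPipelineInfo {
      virtual ~LoopPipelineInfo() = default;
      virtual void adjustTripCount(int Delta) = 0;
    };

    struct ImmediateTripCount : LoopPipelineInfo {
      int TripCount;
      explicit ImmediateTripCount(int TC) : TripCount(TC) {}
      void adjustTripCount(int Delta) override { TripCount += Delta; }
    };

    // Stands in for analyzeLoopForPipelining: return nullptr when the loop
    // shape is not understood, otherwise hand back the per-loop helper.
    static std::unique_ptr<LoopPipelineInfo> analyzeLoop(bool IsHardwareLoop,
                                                         int TC) {
      if (!IsHardwareLoop)
        return nullptr;
      return std::make_unique<ImmediateTripCount>(TC);
    }

    int main() {
      auto Info = analyzeLoop(/*IsHardwareLoop=*/true, /*TC=*/8);
      if (Info)
        Info->adjustTripCount(-1);  // e.g. after peeling one iteration
      return 0;
    }
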
@@ -839,8 +849,8 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
if (Hexagon::HvxWRRegClass.contains(SrcReg, DestReg)) {
- unsigned LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
- unsigned HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+ Register LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+ Register HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg)
.addReg(HiSrc, KillFlag)
.addReg(LoSrc, KillFlag);
@@ -1017,7 +1027,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
auto RealCirc = [&](unsigned Opc, bool HasImm, unsigned MxOp) {
- unsigned Mx = MI.getOperand(MxOp).getReg();
+ Register Mx = MI.getOperand(MxOp).getReg();
unsigned CSx = (Mx == Hexagon::M0 ? Hexagon::CS0 : Hexagon::CS1);
BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrrcr), CSx)
.add(MI.getOperand((HasImm ? 5 : 4)));
@@ -1049,8 +1059,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MBB.erase(MI);
return true;
case Hexagon::V6_vassignp: {
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
unsigned Kill = getKillRegState(MI.getOperand(1).isKill());
BuildMI(MBB, MI, DL, get(Hexagon::V6_vcombine), DstReg)
.addReg(HRI.getSubReg(SrcReg, Hexagon::vsub_hi), Kill)
@@ -1059,18 +1069,18 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
case Hexagon::V6_lo: {
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI.getOperand(1).isKill());
MBB.erase(MI);
MRI.clearKillFlags(SrcSubLo);
return true;
}
case Hexagon::V6_hi: {
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI.getOperand(1).isKill());
MBB.erase(MI);
MRI.clearKillFlags(SrcSubHi);
@@ -1079,9 +1089,9 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case Hexagon::PS_vstorerw_ai:
case Hexagon::PS_vstorerwu_ai: {
bool Aligned = Opc == Hexagon::PS_vstorerw_ai;
- unsigned SrcReg = MI.getOperand(2).getReg();
- unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
- unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+ Register SrcReg = MI.getOperand(2).getReg();
+ Register SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+ Register SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
unsigned NewOpc = Aligned ? Hexagon::V6_vS32b_ai : Hexagon::V6_vS32Ub_ai;
unsigned Offset = HRI.getSpillSize(Hexagon::HvxVRRegClass);
@@ -1103,7 +1113,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case Hexagon::PS_vloadrw_ai:
case Hexagon::PS_vloadrwu_ai: {
bool Aligned = Opc == Hexagon::PS_vloadrw_ai;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
unsigned NewOpc = Aligned ? Hexagon::V6_vL32b_ai : Hexagon::V6_vL32Ub_ai;
unsigned Offset = HRI.getSpillSize(Hexagon::HvxVRRegClass);
@@ -1122,7 +1132,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
case Hexagon::PS_true: {
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg)
.addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef);
@@ -1130,7 +1140,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
case Hexagon::PS_false: {
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(Hexagon::C2_andn), Reg)
.addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef);
@@ -1152,7 +1162,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
case Hexagon::PS_vdd0: {
- unsigned Vd = MI.getOperand(0).getReg();
+ Register Vd = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(Hexagon::V6_vsubw_dv), Vd)
.addReg(Vd, RegState::Undef)
.addReg(Vd, RegState::Undef);
@@ -1161,13 +1171,13 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case Hexagon::PS_vmulw: {
// Expand a 64-bit vector multiply into 2 32-bit scalar multiplies.
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned Src1Reg = MI.getOperand(1).getReg();
- unsigned Src2Reg = MI.getOperand(2).getReg();
- unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi);
- unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo);
- unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi);
- unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo);
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src1Reg = MI.getOperand(1).getReg();
+ Register Src2Reg = MI.getOperand(2).getReg();
+ Register Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi);
+ Register Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo);
+ Register Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi);
+ Register Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo);
BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_mpyi),
HRI.getSubReg(DstReg, Hexagon::isub_hi))
.addReg(Src1SubHi)
@@ -1185,16 +1195,16 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case Hexagon::PS_vmulw_acc: {
// Expand 64-bit vector multiply with addition into 2 scalar multiplies.
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned Src1Reg = MI.getOperand(1).getReg();
- unsigned Src2Reg = MI.getOperand(2).getReg();
- unsigned Src3Reg = MI.getOperand(3).getReg();
- unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi);
- unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo);
- unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi);
- unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo);
- unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::isub_hi);
- unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::isub_lo);
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src1Reg = MI.getOperand(1).getReg();
+ Register Src2Reg = MI.getOperand(2).getReg();
+ Register Src3Reg = MI.getOperand(3).getReg();
+ Register Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi);
+ Register Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo);
+ Register Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi);
+ Register Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo);
+ Register Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::isub_hi);
+ Register Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::isub_lo);
BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_maci),
HRI.getSubReg(DstReg, Hexagon::isub_hi))
.addReg(Src1SubHi)
@@ -1219,10 +1229,10 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const MachineOperand &Op1 = MI.getOperand(1);
const MachineOperand &Op2 = MI.getOperand(2);
const MachineOperand &Op3 = MI.getOperand(3);
- unsigned Rd = Op0.getReg();
- unsigned Pu = Op1.getReg();
- unsigned Rs = Op2.getReg();
- unsigned Rt = Op3.getReg();
+ Register Rd = Op0.getReg();
+ Register Pu = Op1.getReg();
+ Register Rs = Op2.getReg();
+ Register Rt = Op3.getReg();
DebugLoc DL = MI.getDebugLoc();
unsigned K1 = getKillRegState(Op1.isKill());
unsigned K2 = getKillRegState(Op2.isKill());
@@ -1246,7 +1256,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
LivePhysRegs LiveAtMI(HRI);
getLiveRegsAt(LiveAtMI, MI);
bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
- unsigned PReg = Op1.getReg();
+ Register PReg = Op1.getReg();
assert(Op1.getSubReg() == 0);
unsigned PState = getRegState(Op1);
@@ -1280,15 +1290,15 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
LivePhysRegs LiveAtMI(HRI);
getLiveRegsAt(LiveAtMI, MI);
bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
- unsigned PReg = Op1.getReg();
+ Register PReg = Op1.getReg();
assert(Op1.getSubReg() == 0);
unsigned PState = getRegState(Op1);
if (Op0.getReg() != Op2.getReg()) {
unsigned S = Op0.getReg() != Op3.getReg() ? PState & ~RegState::Kill
: PState;
- unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo);
- unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi);
+ Register SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo);
+ Register SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi);
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine))
.add(Op0)
.addReg(PReg, S)
@@ -1299,8 +1309,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
IsDestLive = true;
}
if (Op0.getReg() != Op3.getReg()) {
- unsigned SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo);
- unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi);
+ Register SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo);
+ Register SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi);
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine))
.add(Op0)
.addReg(PReg, PState)
@@ -1856,8 +1866,7 @@ DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
// S2_storeri_io %r29, 132, killed %r1; flags: mem:ST4[FixedStack1]
// Currently AA considers the addresses in these instructions to be aliasing.
bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
- const MachineInstr &MIa, const MachineInstr &MIb,
- AliasAnalysis *AA) const {
+ const MachineInstr &MIa, const MachineInstr &MIb) const {
if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
@@ -1872,7 +1881,7 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
if (!getBaseAndOffsetPosition(MIa, BasePosA, OffsetPosA))
return false;
const MachineOperand &BaseA = MIa.getOperand(BasePosA);
- unsigned BaseRegA = BaseA.getReg();
+ Register BaseRegA = BaseA.getReg();
unsigned BaseSubA = BaseA.getSubReg();
// Get the base register in MIb.
@@ -1880,7 +1889,7 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
if (!getBaseAndOffsetPosition(MIb, BasePosB, OffsetPosB))
return false;
const MachineOperand &BaseB = MIb.getOperand(BasePosB);
- unsigned BaseRegB = BaseB.getReg();
+ Register BaseRegB = BaseB.getReg();
unsigned BaseSubB = BaseB.getSubReg();
if (BaseRegA != BaseRegB || BaseSubA != BaseSubB)
@@ -1984,7 +1993,7 @@ unsigned HexagonInstrInfo::createVR(MachineFunction *MF, MVT VT) const {
llvm_unreachable("Cannot handle this register class");
}
- unsigned NewReg = MRI.createVirtualRegister(TRC);
+ Register NewReg = MRI.createVirtualRegister(TRC);
return NewReg;
}
@@ -2094,12 +2103,12 @@ bool HexagonInstrInfo::isDependent(const MachineInstr &ProdMI,
if (RegA == RegB)
return true;
- if (TargetRegisterInfo::isPhysicalRegister(RegA))
+ if (Register::isPhysicalRegister(RegA))
for (MCSubRegIterator SubRegs(RegA, &HRI); SubRegs.isValid(); ++SubRegs)
if (RegB == *SubRegs)
return true;
- if (TargetRegisterInfo::isPhysicalRegister(RegB))
+ if (Register::isPhysicalRegister(RegB))
for (MCSubRegIterator SubRegs(RegB, &HRI); SubRegs.isValid(); ++SubRegs)
if (RegA == *SubRegs)
return true;
@@ -2605,7 +2614,7 @@ bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr &MI1,
const MachineInstr &MI2) const {
if (mayBeCurLoad(MI1)) {
// if (result of SU is used in Next) return true;
- unsigned DstReg = MI1.getOperand(0).getReg();
+ Register DstReg = MI1.getOperand(0).getReg();
int N = MI2.getNumOperands();
for (int I = 0; I < N; I++)
if (MI2.getOperand(I).isReg() && DstReg == MI2.getOperand(I).getReg())
@@ -3374,7 +3383,7 @@ unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr &GA,
if ((GA.getOpcode() != Hexagon::C2_cmpeqi) ||
(GB.getOpcode() != Hexagon::J2_jumptnew))
return -1u;
- unsigned DestReg = GA.getOperand(0).getReg();
+ Register DestReg = GA.getOperand(0).getReg();
if (!GB.readsRegister(DestReg))
return -1u;
if (DestReg != Hexagon::P0 && DestReg != Hexagon::P1)
@@ -4091,7 +4100,7 @@ int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
// Get DefIdx and UseIdx for super registers.
const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
- if (DefMO.isReg() && HRI.isPhysicalRegister(DefMO.getReg())) {
+ if (DefMO.isReg() && Register::isPhysicalRegister(DefMO.getReg())) {
if (DefMO.isImplicit()) {
for (MCSuperRegIterator SR(DefMO.getReg(), &HRI); SR.isValid(); ++SR) {
int Idx = DefMI.findRegisterDefOperandIdx(*SR, false, false, &HRI);
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index e0a999d0f4c4..60298cd666bb 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -129,21 +129,10 @@ public:
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
- /// Analyze the loop code, return true if it cannot be understood. Upon
- /// success, this function returns false and returns information about the
- /// induction variable and compare instruction used at the end.
- bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
- MachineInstr *&CmpInst) const override;
-
- /// Generate code to reduce the loop iteration by one and check if the loop
- /// is finished. Return the value/register of the new loop count. We need
- /// this function when peeling off one or more iterations of a loop. This
- /// function assumes the nth iteration is peeled first.
- unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
- MachineInstr *IndVar, MachineInstr &Cmp,
- SmallVectorImpl<MachineOperand> &Cond,
- SmallVectorImpl<MachineInstr *> &PrevInsts,
- unsigned Iter, unsigned MaxIter) const override;
+ /// Analyze loop L, which must be a single-basic-block loop, and if the
+ /// conditions can be understood well enough, produce a PipelinerLoopInfo object.
+ std::unique_ptr<PipelinerLoopInfo>
+ analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
/// Return true if it's profitable to predicate
/// instructions with accumulated instruction latency of "NumCycles"
@@ -299,8 +288,7 @@ public:
// memory addresses and false otherwise.
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA = nullptr) const override;
+ const MachineInstr &MIb) const override;
/// For instructions with a base and offset, return the position of the
/// base register and offset operands.
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index cabfd783effa..c5e3cfd080d6 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -22,14 +22,14 @@ class T_RP_pat <InstHexagon MI, Intrinsic IntID>
def: Pat<(int_hexagon_A2_add IntRegs:$Rs, IntRegs:$Rt),
(A2_add IntRegs:$Rs, IntRegs:$Rt)>;
-def: Pat<(int_hexagon_A2_addi IntRegs:$Rs, imm:$s16),
+def: Pat<(int_hexagon_A2_addi IntRegs:$Rs, timm:$s16),
(A2_addi IntRegs:$Rs, imm:$s16)>;
def: Pat<(int_hexagon_A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt),
(A2_addp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
def: Pat<(int_hexagon_A2_sub IntRegs:$Rs, IntRegs:$Rt),
(A2_sub IntRegs:$Rs, IntRegs:$Rt)>;
-def: Pat<(int_hexagon_A2_subri imm:$s10, IntRegs:$Rs),
+def: Pat<(int_hexagon_A2_subri timm:$s10, IntRegs:$Rs),
(A2_subri imm:$s10, IntRegs:$Rs)>;
def: Pat<(int_hexagon_A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt),
(A2_subp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
@@ -45,26 +45,26 @@ def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$Rs, IntRegs:$Rt),
def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt),
(M2_dpmpyuu_s0 IntRegs:$Rs, IntRegs:$Rt)>;
-def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$Rs, imm:$u5),
+def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$Rs, timm:$u5),
(S2_asl_i_r IntRegs:$Rs, imm:$u5)>;
-def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$Rs, imm:$u5),
+def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$Rs, timm:$u5),
(S2_lsr_i_r IntRegs:$Rs, imm:$u5)>;
-def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$Rs, imm:$u5),
+def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$Rs, timm:$u5),
(S2_asr_i_r IntRegs:$Rs, imm:$u5)>;
-def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$Rs, imm:$u6),
+def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$Rs, timm:$u6),
(S2_asl_i_p DoubleRegs:$Rs, imm:$u6)>;
-def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$Rs, imm:$u6),
+def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$Rs, timm:$u6),
(S2_lsr_i_p DoubleRegs:$Rs, imm:$u6)>;
-def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$Rs, imm:$u6),
+def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$Rs, timm:$u6),
(S2_asr_i_p DoubleRegs:$Rs, imm:$u6)>;
def: Pat<(int_hexagon_A2_and IntRegs:$Rs, IntRegs:$Rt),
(A2_and IntRegs:$Rs, IntRegs:$Rt)>;
-def: Pat<(int_hexagon_A2_andir IntRegs:$Rs, imm:$s10),
+def: Pat<(int_hexagon_A2_andir IntRegs:$Rs, timm:$s10),
(A2_andir IntRegs:$Rs, imm:$s10)>;
def: Pat<(int_hexagon_A2_or IntRegs:$Rs, IntRegs:$Rt),
(A2_or IntRegs:$Rs, IntRegs:$Rt)>;
-def: Pat<(int_hexagon_A2_orir IntRegs:$Rs, imm:$s10),
+def: Pat<(int_hexagon_A2_orir IntRegs:$Rs, timm:$s10),
(A2_orir IntRegs:$Rs, imm:$s10)>;
def: Pat<(int_hexagon_A2_xor IntRegs:$Rs, IntRegs:$Rt),
(A2_xor IntRegs:$Rs, IntRegs:$Rt)>;
@@ -99,13 +99,13 @@ def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, (i32 0)),
(S2_vsathub I64:$Rs)>;
}
-def : Pat <(int_hexagon_S2_asr_i_r_rnd_goodsyntax I32:$Rs, u5_0ImmPred:$imm),
+def : Pat <(int_hexagon_S2_asr_i_r_rnd_goodsyntax I32:$Rs, u5_0ImmPred_timm:$imm),
(S2_asr_i_r_rnd I32:$Rs, (UDEC1 u5_0ImmPred:$imm))>;
-def : Pat <(int_hexagon_S2_asr_i_p_rnd_goodsyntax I64:$Rs, u6_0ImmPred:$imm),
+def : Pat <(int_hexagon_S2_asr_i_p_rnd_goodsyntax I64:$Rs, u6_0ImmPred_timm:$imm),
(S2_asr_i_p_rnd I64:$Rs, (UDEC1 u6_0ImmPred:$imm))>;
-def : Pat <(int_hexagon_S5_vasrhrnd_goodsyntax I64:$Rs, u4_0ImmPred:$imm),
+def : Pat <(int_hexagon_S5_vasrhrnd_goodsyntax I64:$Rs, u4_0ImmPred_timm:$imm),
(S5_vasrhrnd I64:$Rs, (UDEC1 u4_0ImmPred:$imm))>;
-def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, u4_0ImmPred:$imm),
+def : Pat <(int_hexagon_S5_asrhub_rnd_sat_goodsyntax I64:$Rs, u4_0ImmPred_timm:$imm),
(S5_asrhub_rnd_sat I64:$Rs, (UDEC1 u4_0ImmPred:$imm))>;
def ImmExt64: SDNodeXForm<imm, [{
@@ -121,13 +121,13 @@ def ImmExt64: SDNodeXForm<imm, [{
// To connect the builtin with the instruction, the builtin's operand
// needs to be extended to the right type.
-def : Pat<(int_hexagon_A2_tfrpi imm:$Is),
+def : Pat<(int_hexagon_A2_tfrpi timm:$Is),
(A2_tfrpi (ImmExt64 $Is))>;
-def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred:$src2),
+def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred_timm:$src2),
(C2_tfrpr (C2_cmpgti I32:$src1, (SDEC1 s32_0ImmPred:$src2)))>;
-def : Pat <(int_hexagon_C2_cmpgeui I32:$src1, u32_0ImmPred:$src2),
+def : Pat <(int_hexagon_C2_cmpgeui I32:$src1, u32_0ImmPred_timm:$src2),
(C2_tfrpr (C2_cmpgtui I32:$src1, (UDEC1 u32_0ImmPred:$src2)))>;
def : Pat <(int_hexagon_C2_cmpgeui I32:$src, 0),
@@ -142,7 +142,7 @@ def : Pat <(int_hexagon_C2_cmpltu I32:$src1, I32:$src2),
//===----------------------------------------------------------------------===//
class S2op_tableidx_pat <Intrinsic IntID, InstHexagon OutputInst,
SDNodeXForm XformImm>
- : Pat <(IntID I32:$src1, I32:$src2, u4_0ImmPred:$src3, u5_0ImmPred:$src4),
+ : Pat <(IntID I32:$src1, I32:$src2, u4_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4),
(OutputInst I32:$src1, I32:$src2, u4_0ImmPred:$src3,
(XformImm u5_0ImmPred:$src4))>;
@@ -197,11 +197,11 @@ class T_stc_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Imm, PatLeaf Val>
: Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s),
(MI I32:$Rs, Imm:$s, I32:$Ru, Val:$Rt)>;
-def: T_stc_pat<S2_storerb_pci, int_hexagon_circ_stb, s4_0ImmPred, I32>;
-def: T_stc_pat<S2_storerh_pci, int_hexagon_circ_sth, s4_1ImmPred, I32>;
-def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw, s4_2ImmPred, I32>;
-def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std, s4_3ImmPred, I64>;
-def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
+def: T_stc_pat<S2_storerb_pci, int_hexagon_circ_stb, s4_0ImmPred_timm, I32>;
+def: T_stc_pat<S2_storerh_pci, int_hexagon_circ_sth, s4_1ImmPred_timm, I32>;
+def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw, s4_2ImmPred_timm, I32>;
+def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std, s4_3ImmPred_timm, I64>;
+def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred_timm, I32>;
multiclass MaskedStore <InstHexagon MI, Intrinsic IntID> {
def : Pat<(IntID HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index ac48e1dc30b0..bda3eccac0cd 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -93,9 +93,9 @@ static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
cl::Hidden, cl::init(true),
cl::desc("Only enable generating memmove in non-nested loops"));
-cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
- cl::Hidden, cl::init(false),
- cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
+static cl::opt<bool> HexagonVolatileMemcpy(
+ "disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false),
+ cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
@@ -632,9 +632,9 @@ Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) {
if (!isa<ConstantInt>(InitV) || !cast<ConstantInt>(InitV)->isZero())
continue;
Value *IterV = PN->getIncomingValueForBlock(BB);
- if (!isa<BinaryOperator>(IterV))
- continue;
auto *BO = dyn_cast<BinaryOperator>(IterV);
+ if (!BO)
+ continue;
if (BO->getOpcode() != Instruction::Add)
continue;
Value *IncV = nullptr;
@@ -2020,7 +2020,7 @@ bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided load. If we have something else, it's a
// random load we can't handle.
- LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
// The trip count of the loop and the base pointer of the addrec SCEV is
@@ -2426,7 +2426,8 @@ bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
DL = &L->getHeader()->getModule()->getDataLayout();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
HasMemcpy = TLI->has(LibFunc_memcpy);
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index db44901ca706..680d01e12af0 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -177,7 +177,7 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
(II->getOperand(i).isUse() || II->getOperand(i).isDef())) {
MachineBasicBlock::iterator localII = II;
++localII;
- unsigned Reg = II->getOperand(i).getReg();
+ Register Reg = II->getOperand(i).getReg();
for (MachineBasicBlock::iterator localBegin = localII; localBegin != end;
++localBegin) {
if (localBegin == skip)
@@ -290,7 +290,7 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
// at machine code level, we don't need this, but if we decide
// to move new value jump prior to RA, we would be needing this.
MachineRegisterInfo &MRI = MF.getRegInfo();
- if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
+ if (secondReg && !Register::isPhysicalRegister(cmpOp2)) {
MachineInstr *def = MRI.getVRegDef(cmpOp2);
if (def->getOpcode() == TargetOpcode::COPY)
return false;
@@ -516,7 +516,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
jmpPos = MII;
jmpInstr = &MI;
predReg = MI.getOperand(0).getReg();
- afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);
+ afterRA = Register::isPhysicalRegister(predReg);
// If ifconverter had not messed up with the kill flags of the
// operands, the following check on the kill flag would suffice.
@@ -603,7 +603,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
(isSecondOpReg &&
MI.getOperand(0).getReg() == (unsigned)cmpOp2))) {
- unsigned feederReg = MI.getOperand(0).getReg();
+ Register feederReg = MI.getOperand(0).getReg();
// First try to see if we can get the feeder from the first operand
// of the compare. If we can not, and if secondOpReg is true
@@ -651,7 +651,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
for (MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned UseR = MO.getReg();
+ Register UseR = MO.getReg();
for (auto I = std::next(MI.getIterator()); I != jmpPos; ++I) {
if (I == cmpPos)
continue;
diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 547da9fd598f..9121115020a2 100644
--- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -162,7 +162,7 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN,
if (!OffsetOp.isImm() || OffsetOp.getImm() > 3)
return false;
- unsigned OffsetReg = MI.getOperand(2).getReg();
+ Register OffsetReg = MI.getOperand(2).getReg();
RegisterRef OffsetRR;
NodeId OffsetRegRD = 0;
for (NodeAddr<UseNode *> UA : AddAslSN.Addr->members_if(DFG->IsUse, *DFG)) {
@@ -348,7 +348,7 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
MachineInstr *AddMI,
const NodeList &UNodeList) {
- unsigned AddDefR = AddMI->getOperand(0).getReg();
+ Register AddDefR = AddMI->getOperand(0).getReg();
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UN = *I;
NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
@@ -381,7 +381,7 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
// Ex: Rx= add(Rt,#10)
// memw(Rx+#0) = Rs
// will be replaced with => memw(Rt+#10) = Rs
- unsigned BaseReg = AddMI->getOperand(1).getReg();
+ Register BaseReg = AddMI->getOperand(1).getReg();
if (!isSafeToExtLR(AddSN, AddMI, BaseReg, UNodeList))
return false;
}
@@ -411,7 +411,7 @@ bool HexagonOptAddrMode::updateAddUses(MachineInstr *AddMI,
MachineInstr *UseMI) {
const MachineOperand ImmOp = AddMI->getOperand(2);
const MachineOperand AddRegOp = AddMI->getOperand(1);
- unsigned newReg = AddRegOp.getReg();
+ Register newReg = AddRegOp.getReg();
const MCInstrDesc &MID = UseMI->getDesc();
MachineOperand &BaseOp = MID.mayLoad() ? UseMI->getOperand(1)
@@ -543,7 +543,7 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
unsigned ImmOpNum) {
bool Changed = false;
- unsigned OpStart;
+ unsigned OpStart = 0;
unsigned OpEnd = OldMI->getNumOperands();
MachineBasicBlock *BB = OldMI->getParent();
auto UsePos = MachineBasicBlock::iterator(OldMI);
@@ -724,7 +724,7 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
}
short SizeInc = 0;
- unsigned DefR = MI->getOperand(0).getReg();
+ Register DefR = MI->getOperand(0).getReg();
InstrEvalMap InstrEvalResult;
// Analyze all uses and calculate increase in size. Perform the optimization
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index fb731f56bfbf..485e658e1c84 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -99,13 +99,21 @@ def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>;
def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>;
def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>;
+def SDTVecLeaf:
+ SDTypeProfile<1, 0, [SDTCisVec<0>]>;
def SDTVecVecIntOp:
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>,
SDTCisVT<3,i32>]>;
+def HexagonPTRUE: SDNode<"HexagonISD::PTRUE", SDTVecLeaf>;
+def HexagonPFALSE: SDNode<"HexagonISD::PFALSE", SDTVecLeaf>;
def HexagonVALIGN: SDNode<"HexagonISD::VALIGN", SDTVecVecIntOp>;
def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>;
+def ptrue: PatFrag<(ops), (HexagonPTRUE)>;
+def pfalse: PatFrag<(ops), (HexagonPFALSE)>;
+def pnot: PatFrag<(ops node:$Pu), (xor node:$Pu, ptrue)>;
+
def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru),
(HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>;
def valignaddr: PatFrag<(ops node:$Addr), (HexagonVALIGNADDR node:$Addr)>;
@@ -154,6 +162,11 @@ def IsNPow2_64H: PatLeaf<(i64 imm), [{
return isPowerOf2_64(NV) && Log2_64(NV) >= 32;
}]>;
+class IsULE<int Width, int Arg>: PatLeaf<(i32 imm),
+ "uint64_t V = N->getZExtValue();" #
+ "return isUInt<" # Width # ">(V) && V <= " # Arg # ";"
+>;
+
class IsUGT<int Width, int Arg>: PatLeaf<(i32 imm),
"uint64_t V = N->getZExtValue();" #
"return isUInt<" # Width # ">(V) && V > " # Arg # ";"
@@ -320,6 +333,24 @@ multiclass SelMinMax_pats<PatFrag CmpOp, PatFrag Val,
(InstB Val:$A, Val:$B)>;
}
+multiclass MinMax_pats<InstHexagon PickT, InstHexagon PickS,
+ PatFrag Sel, PatFrag CmpOp,
+ ValueType CmpType, PatFrag CmpPred> {
+ def: Pat<(Sel (CmpType (CmpOp CmpPred:$Vs, CmpPred:$Vt)),
+ CmpPred:$Vt, CmpPred:$Vs),
+ (PickT CmpPred:$Vs, CmpPred:$Vt)>;
+ def: Pat<(Sel (CmpType (CmpOp CmpPred:$Vs, CmpPred:$Vt)),
+ CmpPred:$Vs, CmpPred:$Vt),
+ (PickS CmpPred:$Vs, CmpPred:$Vt)>;
+}
+
+// Bitcasts between same-size vector types are no-ops, except for the
+// actual type change.
+multiclass NopCast_pat<ValueType Ty1, ValueType Ty2, RegisterClass RC> {
+ def: Pat<(Ty1 (bitconvert (Ty2 RC:$Val))), (Ty1 RC:$Val)>;
+ def: Pat<(Ty2 (bitconvert (Ty1 RC:$Val))), (Ty2 RC:$Val)>;
+}
+
// Frags for commonly used SDNodes.
def Add: pf2<add>; def And: pf2<and>; def Sra: pf2<sra>;
@@ -403,17 +434,18 @@ def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>;
def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>;
def: Pat<(f64 (bitconvert I64:$v)), (F64:$v)>;
-multiclass Cast_pat<ValueType Ta, ValueType Tb, RegisterClass RC> {
- def: Pat<(Tb (bitconvert (Ta RC:$Rs))), (Tb RC:$Rs)>;
- def: Pat<(Ta (bitconvert (Tb RC:$Rs))), (Ta RC:$Rs)>;
-}
-
-// Bit convert vector types to integers.
-defm: Cast_pat<v4i8, i32, IntRegs>;
-defm: Cast_pat<v2i16, i32, IntRegs>;
-defm: Cast_pat<v8i8, i64, DoubleRegs>;
-defm: Cast_pat<v4i16, i64, DoubleRegs>;
-defm: Cast_pat<v2i32, i64, DoubleRegs>;
+// Bit convert 32- and 64-bit types.
+// All of these are bitcastable to one another: i32, v2i16, v4i8.
+defm: NopCast_pat<i32, v2i16, IntRegs>;
+defm: NopCast_pat<i32, v4i8, IntRegs>;
+defm: NopCast_pat<v2i16, v4i8, IntRegs>;
+// All of these are bitcastable to one another: i64, v2i32, v4i16, v8i8.
+defm: NopCast_pat<i64, v2i32, DoubleRegs>;
+defm: NopCast_pat<i64, v4i16, DoubleRegs>;
+defm: NopCast_pat<i64, v8i8, DoubleRegs>;
+defm: NopCast_pat<v2i32, v4i16, DoubleRegs>;
+defm: NopCast_pat<v2i32, v8i8, DoubleRegs>;
+defm: NopCast_pat<v4i16, v8i8, DoubleRegs>;
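As a value-level illustration of what these NopCast_pat instances encode (a minimal C++ sketch, not from the patch; the function name is invented): a bitcast between same-size types reinterprets the same bits, so no instruction is needed.

  #include <cstdint>
  #include <cstring>

  // The same four bytes viewed as v4i8 or as i32; NopCast_pat selects such a
  // bitcast to nothing, since only the type changes, not the register contents.
  static uint32_t bitcastV4I8ToI32(const uint8_t Lanes[4]) {
    uint32_t R;
    std::memcpy(&R, Lanes, sizeof R);
    return R;
  }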
// --(3) Extend/truncate -------------------------------------------------
@@ -497,7 +529,9 @@ def: Pat<(v2i16 (trunc V2I32:$Rs)),
//
def: Pat<(not I1:$Ps), (C2_not I1:$Ps)>;
-def: Pat<(not V8I1:$Ps), (C2_not V8I1:$Ps)>;
+def: Pat<(pnot V2I1:$Ps), (C2_not V2I1:$Ps)>;
+def: Pat<(pnot V4I1:$Ps), (C2_not V4I1:$Ps)>;
+def: Pat<(pnot V8I1:$Ps), (C2_not V8I1:$Ps)>;
def: Pat<(add I1:$Ps, -1), (C2_not I1:$Ps)>;
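For orientation on the pnot fragment used above (a one-line sketch, assumption rather than patch content): xor with the all-true predicate is ordinary logical negation, which is why pnot can be selected as C2_not.

  // Illustration only: for a boolean/predicate value, xor with "true" is NOT.
  static bool predicateNot(bool P) { return P ^ true; } // same result as !P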
multiclass BoolOpR_RR_pat<InstHexagon MI, PatFrag Op> {
@@ -816,14 +850,6 @@ def: Pat<(select (not I1:$Pu), f32ImmPred:$I, F32:$Rs),
def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I),
(C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
-def: Pat<(select I1:$Pu, V4I8:$Rs, V4I8:$Rt),
- (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>;
-def: Pat<(select I1:$Pu, V2I16:$Rs, V2I16:$Rt),
- (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>;
-def: Pat<(select I1:$Pu, V2I32:$Rs, V2I32:$Rt),
- (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
- (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
-
def: Pat<(vselect V8I1:$Pu, V8I8:$Rs, V8I8:$Rt),
(C2_vmux V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)>;
def: Pat<(vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt),
@@ -831,6 +857,14 @@ def: Pat<(vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt),
def: Pat<(vselect V2I1:$Pu, V2I32:$Rs, V2I32:$Rt),
(C2_vmux V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)>;
+def: Pat<(vselect (pnot V8I1:$Pu), V8I8:$Rs, V8I8:$Rt),
+ (C2_vmux V8I1:$Pu, V8I8:$Rt, V8I8:$Rs)>;
+def: Pat<(vselect (pnot V4I1:$Pu), V4I16:$Rs, V4I16:$Rt),
+ (C2_vmux V4I1:$Pu, V4I16:$Rt, V4I16:$Rs)>;
+def: Pat<(vselect (pnot V2I1:$Pu), V2I32:$Rs, V2I32:$Rt),
+ (C2_vmux V2I1:$Pu, V2I32:$Rt, V2I32:$Rs)>;
+
+
// From LegalizeDAG.cpp: (Pu ? Pv : Pw) <=> (Pu & Pv) | (!Pu & Pw).
def: Pat<(select I1:$Pu, I1:$Pv, I1:$Pw),
(C2_or (C2_and I1:$Pu, I1:$Pv),
@@ -863,32 +897,44 @@ let AddedComplexity = 200 in {
}
let AddedComplexity = 200 in {
- defm: SelMinMax_pats<setge, I32, A2_max, A2_min>;
- defm: SelMinMax_pats<setgt, I32, A2_max, A2_min>;
- defm: SelMinMax_pats<setle, I32, A2_min, A2_max>;
- defm: SelMinMax_pats<setlt, I32, A2_min, A2_max>;
- defm: SelMinMax_pats<setuge, I32, A2_maxu, A2_minu>;
- defm: SelMinMax_pats<setugt, I32, A2_maxu, A2_minu>;
- defm: SelMinMax_pats<setule, I32, A2_minu, A2_maxu>;
- defm: SelMinMax_pats<setult, I32, A2_minu, A2_maxu>;
-
- defm: SelMinMax_pats<setge, I64, A2_maxp, A2_minp>;
- defm: SelMinMax_pats<setgt, I64, A2_maxp, A2_minp>;
- defm: SelMinMax_pats<setle, I64, A2_minp, A2_maxp>;
- defm: SelMinMax_pats<setlt, I64, A2_minp, A2_maxp>;
- defm: SelMinMax_pats<setuge, I64, A2_maxup, A2_minup>;
- defm: SelMinMax_pats<setugt, I64, A2_maxup, A2_minup>;
- defm: SelMinMax_pats<setule, I64, A2_minup, A2_maxup>;
- defm: SelMinMax_pats<setult, I64, A2_minup, A2_maxup>;
+ defm: MinMax_pats<A2_min, A2_max, select, setgt, i1, I32>;
+ defm: MinMax_pats<A2_min, A2_max, select, setge, i1, I32>;
+ defm: MinMax_pats<A2_max, A2_min, select, setlt, i1, I32>;
+ defm: MinMax_pats<A2_max, A2_min, select, setle, i1, I32>;
+ defm: MinMax_pats<A2_minu, A2_maxu, select, setugt, i1, I32>;
+ defm: MinMax_pats<A2_minu, A2_maxu, select, setuge, i1, I32>;
+ defm: MinMax_pats<A2_maxu, A2_minu, select, setult, i1, I32>;
+ defm: MinMax_pats<A2_maxu, A2_minu, select, setule, i1, I32>;
+
+ defm: MinMax_pats<A2_minp, A2_maxp, select, setgt, i1, I64>;
+ defm: MinMax_pats<A2_minp, A2_maxp, select, setge, i1, I64>;
+ defm: MinMax_pats<A2_maxp, A2_minp, select, setlt, i1, I64>;
+ defm: MinMax_pats<A2_maxp, A2_minp, select, setle, i1, I64>;
+ defm: MinMax_pats<A2_minup, A2_maxup, select, setugt, i1, I64>;
+ defm: MinMax_pats<A2_minup, A2_maxup, select, setuge, i1, I64>;
+ defm: MinMax_pats<A2_maxup, A2_minup, select, setult, i1, I64>;
+ defm: MinMax_pats<A2_maxup, A2_minup, select, setule, i1, I64>;
}
let AddedComplexity = 100 in {
- defm: SelMinMax_pats<setolt, F32, F2_sfmin, F2_sfmax>;
- defm: SelMinMax_pats<setole, F32, F2_sfmin, F2_sfmax>;
- defm: SelMinMax_pats<setogt, F32, F2_sfmax, F2_sfmin>;
- defm: SelMinMax_pats<setoge, F32, F2_sfmax, F2_sfmin>;
-}
-
+ defm: MinMax_pats<F2_sfmin, F2_sfmax, select, setogt, i1, F32>;
+ defm: MinMax_pats<F2_sfmin, F2_sfmax, select, setoge, i1, F32>;
+ defm: MinMax_pats<F2_sfmax, F2_sfmin, select, setolt, i1, F32>;
+ defm: MinMax_pats<F2_sfmax, F2_sfmin, select, setole, i1, F32>;
+}
+
+defm: MinMax_pats<A2_vminb, A2_vmaxb, vselect, setgt, v8i1, V8I8>;
+defm: MinMax_pats<A2_vminb, A2_vmaxb, vselect, setge, v8i1, V8I8>;
+defm: MinMax_pats<A2_vminh, A2_vmaxh, vselect, setgt, v4i1, V4I16>;
+defm: MinMax_pats<A2_vminh, A2_vmaxh, vselect, setge, v4i1, V4I16>;
+defm: MinMax_pats<A2_vminw, A2_vmaxw, vselect, setgt, v2i1, V2I32>;
+defm: MinMax_pats<A2_vminw, A2_vmaxw, vselect, setge, v2i1, V2I32>;
+defm: MinMax_pats<A2_vminub, A2_vmaxub, vselect, setugt, v8i1, V8I8>;
+defm: MinMax_pats<A2_vminub, A2_vmaxub, vselect, setuge, v8i1, V8I8>;
+defm: MinMax_pats<A2_vminuh, A2_vmaxuh, vselect, setugt, v4i1, V4I16>;
+defm: MinMax_pats<A2_vminuh, A2_vmaxuh, vselect, setuge, v4i1, V4I16>;
+defm: MinMax_pats<A2_vminuw, A2_vmaxuw, vselect, setugt, v2i1, V2I32>;
+defm: MinMax_pats<A2_vminuw, A2_vmaxuw, vselect, setuge, v2i1, V2I32>;
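For orientation (a hedged C++ sketch, not from the patch; function names invented): MinMax_pats matches the two select-of-compare shapes that spell min and max, in both operand orders.

  // Source forms the scalar instances are intended to catch:
  //   select (setgt a, b), b, a  ->  min(a, b)
  //   select (setgt a, b), a, b  ->  max(a, b)
  static int min32(int A, int B) { return A > B ? B : A; } // expected: A2_min
  static int max32(int A, int B) { return A > B ? A : B; } // expected: A2_max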
// --(7) Insert/extract --------------------------------------------------
//
@@ -1639,19 +1685,19 @@ def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
//
// Count leading zeros.
-def: Pat<(ctlz I32:$Rs), (S2_cl0 I32:$Rs)>;
+def: Pat<(i32 (ctlz I32:$Rs)), (S2_cl0 I32:$Rs)>;
def: Pat<(i32 (trunc (ctlz I64:$Rss))), (S2_cl0p I64:$Rss)>;
// Count trailing zeros.
-def: Pat<(cttz I32:$Rs), (S2_ct0 I32:$Rs)>;
+def: Pat<(i32 (cttz I32:$Rs)), (S2_ct0 I32:$Rs)>;
def: Pat<(i32 (trunc (cttz I64:$Rss))), (S2_ct0p I64:$Rss)>;
// Count leading ones.
-def: Pat<(ctlz (not I32:$Rs)), (S2_cl1 I32:$Rs)>;
+def: Pat<(i32 (ctlz (not I32:$Rs))), (S2_cl1 I32:$Rs)>;
def: Pat<(i32 (trunc (ctlz (not I64:$Rss)))), (S2_cl1p I64:$Rss)>;
// Count trailing ones.
-def: Pat<(cttz (not I32:$Rs)), (S2_ct1 I32:$Rs)>;
+def: Pat<(i32 (cttz (not I32:$Rs))), (S2_ct1 I32:$Rs)>;
def: Pat<(i32 (trunc (cttz (not I64:$Rss)))), (S2_ct1p I64:$Rss)>;
// Define leading/trailing patterns that require zero-extensions to 64 bits.
@@ -1706,6 +1752,7 @@ let AddedComplexity = 20 in { // Complexity greater than and/or/xor
(i32 (LoReg $Rss)))>;
}
+
let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
def: Pat<(i1 (setne (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
(S2_tstbit_i IntRegs:$Rs, imm:$u5)>;
@@ -1717,6 +1764,20 @@ let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
(S2_tstbit_i (LoReg DoubleRegs:$Rs), 0)>;
}
+def: Pat<(and (srl I32:$Rs, u5_0ImmPred:$u5), 1),
+ (I1toI32 (S2_tstbit_i I32:$Rs, imm:$u5))>;
+def: Pat<(and (srl I64:$Rss, IsULE<32,31>:$u6), 1),
+ (ToZext64 (I1toI32 (S2_tstbit_i (LoReg $Rss), imm:$u6)))>;
+def: Pat<(and (srl I64:$Rss, IsUGT<32,31>:$u6), 1),
+ (ToZext64 (I1toI32 (S2_tstbit_i (HiReg $Rss), (UDEC32 $u6))))>;
+
+def: Pat<(and (not (srl I32:$Rs, u5_0ImmPred:$u5)), 1),
+ (I1toI32 (S4_ntstbit_i I32:$Rs, imm:$u5))>;
+def: Pat<(and (not (srl I64:$Rss, IsULE<32,31>:$u6)), 1),
+ (ToZext64 (I1toI32 (S4_ntstbit_i (LoReg $Rss), imm:$u6)))>;
+def: Pat<(and (not (srl I64:$Rss, IsUGT<32,31>:$u6)), 1),
+ (ToZext64 (I1toI32 (S4_ntstbit_i (HiReg $Rss), (UDEC32 $u6))))>;
+
let AddedComplexity = 20 in { // Complexity greater than compare reg-imm.
def: Pat<(i1 (seteq (and I32:$Rs, u6_0ImmPred:$u6), 0)),
(C2_bitsclri IntRegs:$Rs, imm:$u6)>;
@@ -1737,23 +1798,28 @@ def: Pat<(HexagonTSTBIT I32:$Rs, u5_0ImmPred:$u5),
def: Pat<(HexagonTSTBIT I32:$Rs, I32:$Rt),
(S2_tstbit_r I32:$Rs, I32:$Rt)>;
+// Add extra complexity to prefer these instructions over bitsset/bitsclr.
+// The reason is that tstbit/ntstbit can be folded into a compound instruction:
+// if ([!]tstbit(...)) jump ...
let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
- def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
- (S4_ntstbit_i I32:$Rs, imm:$u5)>;
+ def: Pat<(i1 (seteq (and I32:$Rs, IsPow2_32:$u5), 0)),
+ (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5))>;
+ def: Pat<(i1 (setne (and I32:$Rs, IsPow2_32:$u5), 0)),
+ (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5))>;
def: Pat<(i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)),
(S4_ntstbit_r I32:$Rs, I32:$Rt)>;
+ def: Pat<(i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)),
+ (S2_tstbit_r I32:$Rs, I32:$Rt)>;
}
-// Add extra complexity to prefer these instructions over bitsset/bitsclr.
-// The reason is that tstbit/ntstbit can be folded into a compound instruction:
-// if ([!]tstbit(...)) jump ...
-let AddedComplexity = 100 in
-def: Pat<(i1 (setne (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))),
- (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5))>;
-
-let AddedComplexity = 100 in
-def: Pat<(i1 (seteq (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))),
- (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5))>;
+def: Pat<(i1 (seteq (and I64:$Rs, IsPow2_64L:$u6), 0)),
+ (S4_ntstbit_i (LoReg $Rs), (Log2_64 $u6))>;
+def: Pat<(i1 (seteq (and I64:$Rs, IsPow2_64H:$u6), 0)),
+ (S4_ntstbit_i (HiReg $Rs), (UDEC32 (i32 (Log2_64 $u6))))>;
+def: Pat<(i1 (setne (and I64:$Rs, IsPow2_64L:$u6), 0)),
+ (S2_tstbit_i (LoReg $Rs), (Log2_32 imm:$u6))>;
+def: Pat<(i1 (setne (and I64:$Rs, IsPow2_64H:$u6), 0)),
+ (S2_tstbit_i (HiReg $Rs), (UDEC32 (i32 (Log2_32 imm:$u6))))>;
// Do not increase complexity of these patterns. In the DAG, "cmp i8" may be
// represented as a compare against "value & 0xFF", which is an exact match
@@ -1773,10 +1839,18 @@ def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), I32:$Rt)),
let AddedComplexity = 100 in {
// Avoid A4_rcmp[n]eqi in these cases:
+ def: Pat<(i32 (zext (i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)))),
+ (I1toI32 (S4_ntstbit_r IntRegs:$Rs, IntRegs:$Rt))>;
def: Pat<(i32 (zext (i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)))),
(I1toI32 (S2_tstbit_r IntRegs:$Rs, IntRegs:$Rt))>;
+ def: Pat<(i32 (zext (i1 (seteq (and I32:$Rs, IsPow2_32:$u5), 0)))),
+ (I1toI32 (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5)))>;
+ def: Pat<(i32 (zext (i1 (setne (and I32:$Rs, IsPow2_32:$u5), 0)))),
+ (I1toI32 (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5)))>;
def: Pat<(i32 (zext (i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)))),
- (I1toI32 (S4_ntstbit_r IntRegs:$Rs, IntRegs:$Rt))>;
+ (I1toI32 (S4_ntstbit_r I32:$Rs, I32:$Rt))>;
+ def: Pat<(i32 (zext (i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)))),
+ (I1toI32 (S2_tstbit_r I32:$Rs, I32:$Rt))>;
}
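To summarize the bit-test shapes handled by the tstbit/ntstbit patterns above (an illustrative C++ sketch, not part of the patch; names invented, instruction mappings are the expected outcome):

  #include <cstdint>

  static bool bitSet(uint32_t X, unsigned N)   { return (X & (1u << N)) != 0; } // expect S2_tstbit_i
  static bool bitClear(uint32_t X, unsigned N) { return (X & (1u << N)) == 0; } // expect S4_ntstbit_i
  static uint32_t bitValue(uint32_t X, unsigned N) { return (X >> N) & 1u; }    // expect tstbit + I1toI32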
// --(11) PIC ------------------------------------------------------------
diff --git a/lib/Target/Hexagon/HexagonPatternsHVX.td b/lib/Target/Hexagon/HexagonPatternsHVX.td
index a4cfca9ac7d7..078a7135c55b 100644
--- a/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -1,5 +1,3 @@
-def SDTVecLeaf:
- SDTypeProfile<1, 0, [SDTCisVec<0>]>;
def SDTVecBinOp:
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>;
@@ -162,23 +160,14 @@ let Predicates = [UseHVX] in {
// Bitcasts between same-size vector types are no-ops, except for the
// actual type change.
-class Bitcast<ValueType ResTy, ValueType InpTy, RegisterClass RC>
- : Pat<(ResTy (bitconvert (InpTy RC:$Val))), (ResTy RC:$Val)>;
-
let Predicates = [UseHVX] in {
- def: Bitcast<VecI8, VecI16, HvxVR>;
- def: Bitcast<VecI8, VecI32, HvxVR>;
- def: Bitcast<VecI16, VecI8, HvxVR>;
- def: Bitcast<VecI16, VecI32, HvxVR>;
- def: Bitcast<VecI32, VecI8, HvxVR>;
- def: Bitcast<VecI32, VecI16, HvxVR>;
-
- def: Bitcast<VecPI8, VecPI16, HvxWR>;
- def: Bitcast<VecPI8, VecPI32, HvxWR>;
- def: Bitcast<VecPI16, VecPI8, HvxWR>;
- def: Bitcast<VecPI16, VecPI32, HvxWR>;
- def: Bitcast<VecPI32, VecPI8, HvxWR>;
- def: Bitcast<VecPI32, VecPI16, HvxWR>;
+ defm: NopCast_pat<VecI8, VecI16, HvxVR>;
+ defm: NopCast_pat<VecI8, VecI32, HvxVR>;
+ defm: NopCast_pat<VecI16, VecI32, HvxVR>;
+
+ defm: NopCast_pat<VecPI8, VecPI16, HvxWR>;
+ defm: NopCast_pat<VecPI8, VecPI32, HvxWR>;
+ defm: NopCast_pat<VecPI16, VecPI32, HvxWR>;
}
let Predicates = [UseHVX] in {
@@ -260,6 +249,21 @@ class Vnot<ValueType VecTy>
: PatFrag<(ops node:$Vs), (xor $Vs, Vneg1<VecTy>)>;
let Predicates = [UseHVX] in {
+ let AddedComplexity = 220 in {
+ defm: MinMax_pats<V6_vminb, V6_vmaxb, vselect, setgt, VecQ8, HVI8>;
+ defm: MinMax_pats<V6_vminb, V6_vmaxb, vselect, setge, VecQ8, HVI8>;
+ defm: MinMax_pats<V6_vminub, V6_vmaxub, vselect, setugt, VecQ8, HVI8>;
+ defm: MinMax_pats<V6_vminub, V6_vmaxub, vselect, setuge, VecQ8, HVI8>;
+ defm: MinMax_pats<V6_vminh, V6_vmaxh, vselect, setgt, VecQ16, HVI16>;
+ defm: MinMax_pats<V6_vminh, V6_vmaxh, vselect, setge, VecQ16, HVI16>;
+ defm: MinMax_pats<V6_vminuh, V6_vmaxuh, vselect, setugt, VecQ16, HVI16>;
+ defm: MinMax_pats<V6_vminuh, V6_vmaxuh, vselect, setuge, VecQ16, HVI16>;
+ defm: MinMax_pats<V6_vminw, V6_vmaxw, vselect, setgt, VecQ32, HVI32>;
+ defm: MinMax_pats<V6_vminw, V6_vmaxw, vselect, setge, VecQ32, HVI32>;
+ }
+}
+
+let Predicates = [UseHVX] in {
let AddedComplexity = 200 in {
def: Pat<(Vnot<VecI8> HVI8:$Vs), (V6_vnot HvxVR:$Vs)>;
def: Pat<(Vnot<VecI16> HVI16:$Vs), (V6_vnot HvxVR:$Vs)>;
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 8f761d2d4805..0ccfe64ad1e5 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -136,11 +136,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
assert(MI.getNumOperands() == 2);
MachineOperand &Dst = MI.getOperand(0);
MachineOperand &Src = MI.getOperand(1);
- unsigned DstReg = Dst.getReg();
- unsigned SrcReg = Src.getReg();
+ Register DstReg = Dst.getReg();
+ Register SrcReg = Src.getReg();
// Just handle virtual registers.
- if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
- TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (Register::isVirtualRegister(DstReg) &&
+ Register::isVirtualRegister(SrcReg)) {
// Map the following:
// %170 = SXTW %166
// PeepholeMap[170] = %166
@@ -157,8 +157,8 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &Src2 = MI.getOperand(2);
if (Src1.getImm() != 0)
continue;
- unsigned DstReg = Dst.getReg();
- unsigned SrcReg = Src2.getReg();
+ Register DstReg = Dst.getReg();
+ Register SrcReg = Src2.getReg();
PeepholeMap[DstReg] = SrcReg;
}
@@ -174,8 +174,8 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &Src2 = MI.getOperand(2);
if (Src2.getImm() != 32)
continue;
- unsigned DstReg = Dst.getReg();
- unsigned SrcReg = Src1.getReg();
+ Register DstReg = Dst.getReg();
+ Register SrcReg = Src1.getReg();
PeepholeDoubleRegsMap[DstReg] =
std::make_pair(*&SrcReg, Hexagon::isub_hi);
}
@@ -185,11 +185,11 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
assert(MI.getNumOperands() == 2);
MachineOperand &Dst = MI.getOperand(0);
MachineOperand &Src = MI.getOperand(1);
- unsigned DstReg = Dst.getReg();
- unsigned SrcReg = Src.getReg();
+ Register DstReg = Dst.getReg();
+ Register SrcReg = Src.getReg();
// Just handle virtual registers.
- if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
- TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (Register::isVirtualRegister(DstReg) &&
+ Register::isVirtualRegister(SrcReg)) {
// Map the following:
// %170 = NOT_xx %166
// PeepholeMap[170] = %166
@@ -208,10 +208,10 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
if (Src.getSubReg() != Hexagon::isub_lo)
continue;
- unsigned DstReg = Dst.getReg();
- unsigned SrcReg = Src.getReg();
- if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
- TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ Register DstReg = Dst.getReg();
+ Register SrcReg = Src.getReg();
+ if (Register::isVirtualRegister(DstReg) &&
+ Register::isVirtualRegister(SrcReg)) {
// Try to find in the map.
if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
// Change the 1st operand.
@@ -237,12 +237,12 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
bool Done = false;
if (QII->isPredicated(MI)) {
MachineOperand &Op0 = MI.getOperand(0);
- unsigned Reg0 = Op0.getReg();
+ Register Reg0 = Op0.getReg();
const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0);
if (RC0->getID() == Hexagon::PredRegsRegClassID) {
// Handle instructions that have a predicate register in op0
// (most cases of predicable instructions).
- if (TargetRegisterInfo::isVirtualRegister(Reg0)) {
+ if (Register::isVirtualRegister(Reg0)) {
// Try to find in the map.
if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) {
// Change the 1st operand and flip the opcode.
@@ -275,7 +275,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
break;
}
if (NewOp) {
- unsigned PSrc = MI.getOperand(PR).getReg();
+ Register PSrc = MI.getOperand(PR).getReg();
if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(),
QII->get(NewOp), MI.getOperand(0).getReg())
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 4f5f750e5842..b7171fb14272 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -217,7 +217,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If the offset is not valid, calculate the address in a temporary
// register and use it with offset 0.
auto &MRI = MF.getRegInfo();
- unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), TmpR)
.addReg(BP)
@@ -249,8 +249,8 @@ bool HexagonRegisterInfo::shouldCoalesce(MachineInstr *MI,
if (!SmallSrc && !SmallDst)
return true;
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(1).getReg();
const SlotIndexes &Indexes = *LIS.getSlotIndexes();
auto HasCall = [&Indexes] (const LiveInterval::Segment &S) {
for (SlotIndex I = S.start.getBaseIndex(), E = S.end.getBaseIndex();
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index bd4254aea276..f9fb14c190ff 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -76,18 +76,18 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
unsigned Opc = MI.getOpcode();
if (Opc == Hexagon::CONST32) {
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
uint64_t ImmValue = MI.getOperand(1).getImm();
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), DestReg)
.addImm(ImmValue);
B.erase(&MI);
} else if (Opc == Hexagon::CONST64) {
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
int64_t ImmValue = MI.getOperand(1).getImm();
const DebugLoc &DL = MI.getDebugLoc();
- unsigned DestLo = TRI->getSubReg(DestReg, Hexagon::isub_lo);
- unsigned DestHi = TRI->getSubReg(DestReg, Hexagon::isub_hi);
+ Register DestLo = TRI->getSubReg(DestReg, Hexagon::isub_lo);
+ Register DestHi = TRI->getSubReg(DestReg, Hexagon::isub_hi);
int32_t LowWord = (ImmValue & 0xFFFFFFFF);
int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF;
diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp
index 013eede2d414..55f31c628854 100644
--- a/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -210,8 +210,8 @@ bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const {
for (auto &Op : MI->operands()) {
if (!Op.isReg())
continue;
- unsigned R = Op.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = Op.getReg();
+ if (!Register::isVirtualRegister(R))
return true;
}
return false;
@@ -224,14 +224,14 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
unsigned NumRegs = MRI->getNumVirtRegs();
BitVector DoubleRegs(NumRegs);
for (unsigned i = 0; i < NumRegs; ++i) {
- unsigned R = TargetRegisterInfo::index2VirtReg(i);
+ unsigned R = Register::index2VirtReg(i);
if (MRI->getRegClass(R) == DoubleRC)
DoubleRegs.set(i);
}
BitVector FixedRegs(NumRegs);
for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
- unsigned R = TargetRegisterInfo::index2VirtReg(x);
+ unsigned R = Register::index2VirtReg(x);
MachineInstr *DefI = MRI->getVRegDef(R);
// In some cases a register may exist, but never be defined or used.
// It should never appear anywhere, but mark it as "fixed", just to be
@@ -244,7 +244,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
if (FixedRegs[x])
continue;
- unsigned R = TargetRegisterInfo::index2VirtReg(x);
+ unsigned R = Register::index2VirtReg(x);
LLVM_DEBUG(dbgs() << printReg(R, TRI) << " ~~");
USet &Asc = AssocMap[R];
for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end();
@@ -258,14 +258,14 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
// Skip non-registers or registers with subregisters.
if (&MO == &Op || !MO.isReg() || MO.getSubReg())
continue;
- unsigned T = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(T)) {
+ Register T = MO.getReg();
+ if (!Register::isVirtualRegister(T)) {
FixedRegs.set(x);
continue;
}
if (MRI->getRegClass(T) != DoubleRC)
continue;
- unsigned u = TargetRegisterInfo::virtReg2Index(T);
+ unsigned u = Register::virtReg2Index(T);
if (FixedRegs[u])
continue;
LLVM_DEBUG(dbgs() << ' ' << printReg(T, TRI));
@@ -281,7 +281,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
unsigned NextP = 1;
USet Visited;
for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
- unsigned R = TargetRegisterInfo::index2VirtReg(x);
+ unsigned R = Register::index2VirtReg(x);
if (Visited.count(R))
continue;
// Create a new partition for R.
@@ -372,8 +372,8 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
case Hexagon::A2_andp:
case Hexagon::A2_orp:
case Hexagon::A2_xorp: {
- unsigned Rs = MI->getOperand(1).getReg();
- unsigned Rt = MI->getOperand(2).getReg();
+ Register Rs = MI->getOperand(1).getReg();
+ Register Rt = MI->getOperand(2).getReg();
return profit(Rs) + profit(Rt);
}
@@ -400,7 +400,7 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
}
int32_t HexagonSplitDoubleRegs::profit(unsigned Reg) const {
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Register::isVirtualRegister(Reg));
const MachineInstr *DefI = MRI->getVRegDef(Reg);
switch (DefI->getOpcode()) {
@@ -499,7 +499,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
return;
assert(Cond[1].isReg() && "Unexpected Cond vector from analyzeBranch");
// Expect a predicate register.
- unsigned PR = Cond[1].getReg();
+ Register PR = Cond[1].getReg();
assert(MRI->getRegClass(PR) == &Hexagon::PredRegsRegClass);
// Get the registers on which the loop controlling compare instruction
@@ -535,7 +535,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
if (!MI.isPHI())
break;
const MachineOperand &MD = MI.getOperand(0);
- unsigned R = MD.getReg();
+ Register R = MD.getReg();
if (MRI->getRegClass(R) == DoubleRC)
DP.push_back(R);
}
@@ -551,7 +551,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
// Get the output from the add. If it is one of the inputs to the
// loop-controlling compare instruction, then R is likely an induc-
// tion register.
- unsigned T = UseI->getOperand(0).getReg();
+ Register T = UseI->getOperand(0).getReg();
if (T == CmpR1 || T == CmpR2)
return false;
}
@@ -603,9 +603,9 @@ void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI,
continue;
}
// For register operands, set the subregister.
- unsigned R = Op.getReg();
+ Register R = Op.getReg();
unsigned SR = Op.getSubReg();
- bool isVirtReg = TargetRegisterInfo::isVirtualRegister(R);
+ bool isVirtReg = Register::isVirtualRegister(R);
bool isKill = Op.isKill();
if (isVirtReg && MRI->getRegClass(R) == DoubleRC) {
isKill = false;
@@ -674,7 +674,7 @@ void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI,
: MI->getOperand(2).getImm();
MachineOperand &UpdOp = Load ? MI->getOperand(1) : MI->getOperand(0);
const TargetRegisterClass *RC = MRI->getRegClass(UpdOp.getReg());
- unsigned NewR = MRI->createVirtualRegister(RC);
+ Register NewR = MRI->createVirtualRegister(RC);
assert(!UpdOp.getSubReg() && "Def operand with subreg");
BuildMI(B, MI, DL, TII->get(Hexagon::A2_addi), NewR)
.addReg(AdrOp.getReg(), RSA)
@@ -789,8 +789,8 @@ void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI,
UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
assert(F != PairMap.end());
const UUPair &P = F->second;
- unsigned LoR = P.first;
- unsigned HiR = P.second;
+ Register LoR = P.first;
+ Register HiR = P.second;
unsigned Opc = MI->getOpcode();
bool Right = (Opc == S2_lsr_i_p || Opc == S2_asr_i_p);
@@ -813,7 +813,7 @@ void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI,
.addReg(Op1.getReg(), RS, HiSR);
} else if (S < 32) {
const TargetRegisterClass *IntRC = &IntRegsRegClass;
- unsigned TmpR = MRI->createVirtualRegister(IntRC);
+ Register TmpR = MRI->createVirtualRegister(IntRC);
// Expansion:
// Shift left: DR = shl R, #s
// LoR = shl R.lo, #s
@@ -953,12 +953,12 @@ void HexagonSplitDoubleRegs::splitAslOr(MachineInstr *MI,
.addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR)
.addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR)
.addImm(S);
- unsigned TmpR1 = MRI->createVirtualRegister(IntRC);
+ Register TmpR1 = MRI->createVirtualRegister(IntRC);
BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR1)
.addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR)
.addImm(S)
.addImm(32-S);
- unsigned TmpR2 = MRI->createVirtualRegister(IntRC);
+ Register TmpR2 = MRI->createVirtualRegister(IntRC);
BuildMI(B, MI, DL, TII->get(A2_or), TmpR2)
.addReg(Op1.getReg(), RS1, HiSR)
.addReg(TmpR1);
@@ -1002,7 +1002,7 @@ bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI,
switch (Opc) {
case TargetOpcode::PHI:
case TargetOpcode::COPY: {
- unsigned DstR = MI->getOperand(0).getReg();
+ Register DstR = MI->getOperand(0).getReg();
if (MRI->getRegClass(DstR) == DoubleRC) {
createHalfInstr(Opc, MI, PairMap, isub_lo);
createHalfInstr(Opc, MI, PairMap, isub_hi);
@@ -1079,7 +1079,7 @@ void HexagonSplitDoubleRegs::replaceSubregUses(MachineInstr *MI,
for (auto &Op : MI->operands()) {
if (!Op.isReg() || !Op.isUse() || !Op.getSubReg())
continue;
- unsigned R = Op.getReg();
+ Register R = Op.getReg();
UUPairMap::const_iterator F = PairMap.find(R);
if (F == PairMap.end())
continue;
@@ -1104,8 +1104,8 @@ void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI,
for (auto &Op : MI->operands()) {
if (!Op.isReg() || !Op.isUse())
continue;
- unsigned R = Op.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(R))
+ Register R = Op.getReg();
+ if (!Register::isVirtualRegister(R))
continue;
if (MRI->getRegClass(R) != DoubleRC || Op.getSubReg())
continue;
@@ -1113,7 +1113,7 @@ void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI,
if (F == PairMap.end())
continue;
const UUPair &Pr = F->second;
- unsigned NewDR = MRI->createVirtualRegister(DoubleRC);
+ Register NewDR = MRI->createVirtualRegister(DoubleRC);
BuildMI(B, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), NewDR)
.addReg(Pr.first)
.addImm(Hexagon::isub_lo)
@@ -1145,8 +1145,8 @@ bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
U != W; ++U)
SplitIns.insert(U->getParent());
- unsigned LoR = MRI->createVirtualRegister(IntRC);
- unsigned HiR = MRI->createVirtualRegister(IntRC);
+ Register LoR = MRI->createVirtualRegister(IntRC);
+ Register HiR = MRI->createVirtualRegister(IntRC);
LLVM_DEBUG(dbgs() << "Created mapping: " << printReg(DR, TRI) << " -> "
<< printReg(HiR, TRI) << ':' << printReg(LoR, TRI)
<< '\n');
diff --git a/lib/Target/Hexagon/HexagonStoreWidening.cpp b/lib/Target/Hexagon/HexagonStoreWidening.cpp
index b8b61517ff95..27fefa5f5e2b 100644
--- a/lib/Target/Hexagon/HexagonStoreWidening.cpp
+++ b/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -441,7 +441,7 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
// Create vreg = A2_tfrsi #Acc; mem[hw] = vreg
const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF);
- unsigned VReg = MF->getRegInfo().createVirtualRegister(RC);
+ Register VReg = MF->getRegInfo().createVirtualRegister(RC);
MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg)
.addImm(int(Acc));
NG.push_back(TfrI);
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 7ec63a642b0c..6c706fea096b 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -119,7 +119,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
FeatureBitset Features = getFeatureBits();
if (HexagonDisableDuplex)
- setFeatureBits(Features.set(Hexagon::FeatureDuplex, false));
+ setFeatureBits(Features.reset(Hexagon::FeatureDuplex));
setFeatureBits(Hexagon_MC::completeHVXFeatures(Features));
return *this;
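Aside on the FeatureDuplex change above (illustration only, using std::bitset rather than FeatureBitset): set(i, false) and reset(i) clear the same bit; the patch simply uses the more direct spelling.

  #include <bitset>
  #include <cassert>

  static void clearBitEquivalence() {
    std::bitset<8> A("11111111"), B("11111111");
    A.set(3, false); // old spelling in this file
    B.reset(3);      // spelling the patch switches to
    assert(A == B);
  }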
@@ -230,7 +230,7 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
else if (SchedRetvalOptimization) {
const MachineInstr *MI = DAG->SUnits[su].getInstr();
if (MI->isCopy() &&
- TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
+ Register::isPhysicalRegister(MI->getOperand(1).getReg())) {
// %vregX = COPY %r0
VRegHoldingReg[MI->getOperand(0).getReg()] = MI->getOperand(1).getReg();
LastVRegUse.erase(MI->getOperand(1).getReg());
@@ -243,8 +243,7 @@ void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
VRegHoldingReg.count(MO.getReg())) {
// <use of %vregX>
LastVRegUse[VRegHoldingReg[MO.getReg()]] = &DAG->SUnits[su];
- } else if (MO.isDef() &&
- TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ } else if (MO.isDef() && Register::isPhysicalRegister(MO.getReg())) {
for (MCRegAliasIterator AI(MO.getReg(), &TRI, true); AI.isValid();
++AI) {
if (LastVRegUse.count(*AI) &&
@@ -345,7 +344,7 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
// If it's a REG_SEQUENCE/COPY, use its destination instruction to determine
// the correct latency.
if ((DstInst->isRegSequence() || DstInst->isCopy()) && Dst->NumSuccs == 1) {
- unsigned DReg = DstInst->getOperand(0).getReg();
+ Register DReg = DstInst->getOperand(0).getReg();
MachineInstr *DDst = Dst->Succs[0].getSUnit()->getInstr();
unsigned UseIdx = -1;
for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) {
@@ -375,15 +374,15 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
void HexagonSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(llvm::make_unique<UsrOverflowMutation>());
- Mutations.push_back(llvm::make_unique<HVXMemLatencyMutation>());
- Mutations.push_back(llvm::make_unique<BankConflictMutation>());
+ Mutations.push_back(std::make_unique<UsrOverflowMutation>());
+ Mutations.push_back(std::make_unique<HVXMemLatencyMutation>());
+ Mutations.push_back(std::make_unique<BankConflictMutation>());
}
void HexagonSubtarget::getSMSMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(llvm::make_unique<UsrOverflowMutation>());
- Mutations.push_back(llvm::make_unique<HVXMemLatencyMutation>());
+ Mutations.push_back(std::make_unique<UsrOverflowMutation>());
+ Mutations.push_back(std::make_unique<HVXMemLatencyMutation>());
}
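Aside on the make_unique changes (a minimal sketch, not from the patch; the mutation type here is an invented stand-in): with C++14 available, std::make_unique allocates and wraps the object in one step, which is what llvm::make_unique previously provided.

  #include <memory>
  #include <vector>

  struct DemoMutation { int Id = 0; }; // invented stand-in type

  static void addDemoMutation(std::vector<std::unique_ptr<DemoMutation>> &Out) {
    Out.push_back(std::make_unique<DemoMutation>());
  }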
// Pin the vtable to this file.
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 007423ef1902..31157a0065d9 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -228,7 +228,7 @@ public:
}
bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const {
- if (!VecTy.isVector() || !useHVXOps())
+ if (!VecTy.isVector() || !useHVXOps() || VecTy.isScalableVector())
return false;
MVT ElemTy = VecTy.getVectorElementType();
if (!IncludeBool && ElemTy == MVT::i1)
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 80b8480448fe..d709a82be660 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -111,10 +111,10 @@ int HexagonTargetMachineModule = 0;
static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
- new VLIWMachineScheduler(C, make_unique<ConvergingVLIWScheduler>());
- DAG->addMutation(make_unique<HexagonSubtarget::UsrOverflowMutation>());
- DAG->addMutation(make_unique<HexagonSubtarget::HVXMemLatencyMutation>());
- DAG->addMutation(make_unique<HexagonSubtarget::CallMutation>());
+ new VLIWMachineScheduler(C, std::make_unique<ConvergingVLIWScheduler>());
+ DAG->addMutation(std::make_unique<HexagonSubtarget::UsrOverflowMutation>());
+ DAG->addMutation(std::make_unique<HexagonSubtarget::HVXMemLatencyMutation>());
+ DAG->addMutation(std::make_unique<HexagonSubtarget::CallMutation>());
DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
@@ -218,7 +218,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small),
(HexagonNoOpt ? CodeGenOpt::None : OL)),
- TLOF(make_unique<HexagonTargetObjectFile>()) {
+ TLOF(std::make_unique<HexagonTargetObjectFile>()) {
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
initAsmInfo();
}
@@ -244,7 +244,7 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this);
+ I = std::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this);
}
return I.get();
}
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 38062e8e922c..ddbc5543348d 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -45,6 +45,8 @@ bool HexagonTTIImpl::useHVX() const {
bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
assert(VecTy->isVectorTy());
+ if (cast<VectorType>(VecTy)->isScalable())
+ return false;
// Avoid types like <2 x i32*>.
if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
return false;
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 27e8fc019007..12ede503af83 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -68,8 +68,8 @@ public:
bool shouldFavorPostInc() const;
// L1 cache prefetch.
- unsigned getPrefetchDistance() const;
- unsigned getCacheLineSize() const;
+ unsigned getPrefetchDistance() const override;
+ unsigned getCacheLineSize() const override;
/// @}
diff --git a/lib/Target/Hexagon/HexagonVExtract.cpp b/lib/Target/Hexagon/HexagonVExtract.cpp
index a9692f42e468..0c0266a6839a 100644
--- a/lib/Target/Hexagon/HexagonVExtract.cpp
+++ b/lib/Target/Hexagon/HexagonVExtract.cpp
@@ -67,9 +67,9 @@ unsigned HexagonVExtract::genElemLoad(MachineInstr *ExtI, unsigned BaseR,
MachineRegisterInfo &MRI) {
MachineBasicBlock &ExtB = *ExtI->getParent();
DebugLoc DL = ExtI->getDebugLoc();
- unsigned ElemR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register ElemR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
- unsigned ExtIdxR = ExtI->getOperand(2).getReg();
+ Register ExtIdxR = ExtI->getOperand(2).getReg();
unsigned ExtIdxS = ExtI->getOperand(2).getSubReg();
// Simplified check for a compile-time constant value of ExtIdxR.
@@ -86,7 +86,7 @@ unsigned HexagonVExtract::genElemLoad(MachineInstr *ExtI, unsigned BaseR,
}
}
- unsigned IdxR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register IdxR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::A2_andir), IdxR)
.add(ExtI->getOperand(2))
.addImm(-4);
@@ -111,7 +111,7 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) {
unsigned Opc = MI.getOpcode();
if (Opc != Hexagon::V6_extractw)
continue;
- unsigned VecR = MI.getOperand(1).getReg();
+ Register VecR = MI.getOperand(1).getReg();
VExtractMap[VecR].push_back(&MI);
}
}
@@ -144,13 +144,13 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock &ExtB = *ExtI->getParent();
DebugLoc DL = ExtI->getDebugLoc();
- unsigned BaseR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ Register BaseR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
BuildMI(ExtB, ExtI, DL, HII->get(Hexagon::PS_fi), BaseR)
.addFrameIndex(FI)
.addImm(SR == 0 ? 0 : VecSize/2);
unsigned ElemR = genElemLoad(ExtI, BaseR, MRI);
- unsigned ExtR = ExtI->getOperand(0).getReg();
+ Register ExtR = ExtI->getOperand(0).getReg();
MRI.replaceRegWith(ExtR, ElemR);
ExtB.erase(ExtI);
Changed = true;
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 3619e4c239d7..fab5edefb553 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -24,6 +24,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
@@ -57,9 +58,9 @@ static cl::opt<bool> DisablePacketizer("disable-packetizer", cl::Hidden,
cl::ZeroOrMore, cl::init(false),
cl::desc("Disable Hexagon packetizer pass"));
-cl::opt<bool> Slot1Store("slot1-store-slot0-load", cl::Hidden,
- cl::ZeroOrMore, cl::init(true),
- cl::desc("Allow slot1 store and slot0 load"));
+static cl::opt<bool> Slot1Store("slot1-store-slot0-load", cl::Hidden,
+ cl::ZeroOrMore, cl::init(true),
+ cl::desc("Allow slot1 store and slot0 load"));
static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles",
cl::ZeroOrMore, cl::Hidden, cl::init(true),
@@ -129,16 +130,16 @@ INITIALIZE_PASS_END(HexagonPacketizer, "hexagon-packetizer",
"Hexagon Packetizer", false, false)
HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF,
- MachineLoopInfo &MLI, AliasAnalysis *AA,
+ MachineLoopInfo &MLI, AAResults *AA,
const MachineBranchProbabilityInfo *MBPI, bool Minimal)
: VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI),
Minimal(Minimal) {
HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
- addMutation(llvm::make_unique<HexagonSubtarget::UsrOverflowMutation>());
- addMutation(llvm::make_unique<HexagonSubtarget::HVXMemLatencyMutation>());
- addMutation(llvm::make_unique<HexagonSubtarget::BankConflictMutation>());
+ addMutation(std::make_unique<HexagonSubtarget::UsrOverflowMutation>());
+ addMutation(std::make_unique<HexagonSubtarget::HVXMemLatencyMutation>());
+ addMutation(std::make_unique<HexagonSubtarget::BankConflictMutation>());
}
// Check if FirstI modifies a register that SecondI reads.
@@ -148,7 +149,7 @@ static bool hasWriteToReadDep(const MachineInstr &FirstI,
for (auto &MO : FirstI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned R = MO.getReg();
+ Register R = MO.getReg();
if (SecondI.readsRegister(R, TRI))
return true;
}
@@ -422,7 +423,7 @@ bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI,
dbgs() << "Checking CUR against ";
MJ.dump();
});
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
bool FoundMatch = false;
for (auto &MO : MJ.operands())
if (MO.isReg() && MO.getReg() == DestReg)
@@ -515,7 +516,7 @@ bool HexagonPacketizerList::updateOffset(SUnit *SUI, SUnit *SUJ) {
unsigned BPJ, OPJ;
if (!HII->getBaseAndOffsetPosition(MJ, BPJ, OPJ))
return false;
- unsigned Reg = MI.getOperand(BPI).getReg();
+ Register Reg = MI.getOperand(BPI).getReg();
if (Reg != MJ.getOperand(BPJ).getReg())
return false;
// Make sure that the dependences do not restrict adding MI to the packet.
@@ -788,7 +789,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI,
return false;
if (!MO.isReg() || !MO.isDef() || !MO.isImplicit())
continue;
- unsigned R = MO.getReg();
+ Register R = MO.getReg();
if (R == DepReg || HRI->isSuperRegister(DepReg, R))
return false;
}
@@ -1208,7 +1209,7 @@ bool HexagonPacketizerList::hasDeadDependence(const MachineInstr &I,
for (auto &MO : J.operands()) {
if (!MO.isReg() || !MO.isDef() || !MO.isDead())
continue;
- unsigned R = MO.getReg();
+ Register R = MO.getReg();
if (R != Hexagon::USR_OVF && DeadDefs[R])
return true;
}
@@ -1585,7 +1586,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
// subset of the volatile register set.
for (const MachineOperand &Op : I.operands()) {
if (Op.isReg() && Op.isDef()) {
- unsigned R = Op.getReg();
+ Register R = Op.getReg();
if (!J.readsRegister(R, HRI) && !J.modifiesRegister(R, HRI))
continue;
} else if (!Op.isRegMask()) {
@@ -1763,6 +1764,16 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) {
void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
MachineBasicBlock::iterator EndMI) {
// Replace VLIWPacketizerList::endPacket(MBB, EndMI).
+ LLVM_DEBUG({
+ if (!CurrentPacketMIs.empty()) {
+ dbgs() << "Finalizing packet:\n";
+ unsigned Idx = 0;
+ for (MachineInstr *MI : CurrentPacketMIs) {
+ unsigned R = ResourceTracker->getUsedResources(Idx++);
+ dbgs() << " * [res:0x" << utohexstr(R) << "] " << *MI;
+ }
+ }
+ });
bool memShufDisabled = getmemShufDisabled();
if (memShufDisabled && !foundLSInPacket()) {
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index daa86b6f5393..943b9ac7ecc4 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -69,8 +69,7 @@ private:
public:
HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
- AliasAnalysis *AA,
- const MachineBranchProbabilityInfo *MBPI,
+ AAResults *AA, const MachineBranchProbabilityInfo *MBPI,
bool Minimal);
// initPacketizerState - initialize some internal flags.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 7c0770926abe..75cb398d4097 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -201,9 +201,7 @@ public:
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override {
- MCFixupKind Kind = Fixup.getKind();
-
- switch((unsigned)Kind) {
+ switch(Fixup.getTargetKind()) {
default:
llvm_unreachable("Unknown Fixup Kind!");
@@ -583,7 +581,7 @@ public:
return false;
// If we cannot resolve the fixup value, it requires relaxation.
if (!Resolved) {
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
case fixup_Hexagon_B22_PCREL:
// GetFixupCount assumes B22 won't relax
LLVM_FALLTHROUGH;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index f678bf49322e..cdbeae38b3a1 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -44,7 +44,7 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx,
MCFixup const &Fixup,
bool IsPCRel) const {
MCSymbolRefExpr::VariantKind Variant = Target.getAccessVariant();
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
report_fatal_error("Unrecognized relocation type");
break;
@@ -299,5 +299,5 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createHexagonELFObjectWriter(uint8_t OSABI, StringRef CPU) {
- return llvm::make_unique<HexagonELFObjectWriter>(OSABI, CPU);
+ return std::make_unique<HexagonELFObjectWriter>(OSABI, CPU);
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
index fcd3758600c1..8b262bd0248e 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -726,9 +726,6 @@ void HexagonMCChecker::reportNote(SMLoc Loc, llvm::Twine const &Msg) {
}
void HexagonMCChecker::reportWarning(Twine const &Msg) {
- if (ReportErrors) {
- auto SM = Context.getSourceManager();
- if (SM)
- SM->PrintMessage(MCB.getLoc(), SourceMgr::DK_Warning, Msg);
- }
+ if (ReportErrors)
+ Context.reportWarning(MCB.getLoc(), Msg);
}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index f2432883af6f..a799f7f7c0b9 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -116,8 +116,8 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
}
// Update the maximum alignment of the section if necessary.
- if (ByteAlignment > Section.getAlignment())
- Section.setAlignment(ByteAlignment);
+ if (Align(ByteAlignment) > Section.getAlignment())
+ Section.setAlignment(Align(ByteAlignment));
SwitchSection(P.first, P.second);
} else {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 9c50b25156c3..870ab9e94a63 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -72,7 +72,6 @@ cl::opt<bool> MV65("mv65", cl::Hidden, cl::desc("Build for Hexagon V65"),
cl::init(false));
cl::opt<bool> MV66("mv66", cl::Hidden, cl::desc("Build for Hexagon V66"),
cl::init(false));
-} // namespace
cl::opt<Hexagon::ArchEnum>
EnableHVX("mhvx",
@@ -86,6 +85,7 @@ cl::opt<Hexagon::ArchEnum>
clEnumValN(Hexagon::ArchEnum::Generic, "", "")),
// Sentinel for flag not present.
cl::init(Hexagon::ArchEnum::NoArch), cl::ValueOptional);
+} // namespace
static cl::opt<bool>
DisableHVX("mno-hvx", cl::Hidden,
@@ -264,14 +264,12 @@ createHexagonObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
}
static void LLVM_ATTRIBUTE_UNUSED clearFeature(MCSubtargetInfo* STI, uint64_t F) {
- uint64_t FB = STI->getFeatureBits().to_ullong();
- if (FB & (1ULL << F))
+ if (STI->getFeatureBits()[F])
STI->ToggleFeature(F);
}
static bool LLVM_ATTRIBUTE_UNUSED checkFeature(MCSubtargetInfo* STI, uint64_t F) {
- uint64_t FB = STI->getFeatureBits().to_ullong();
- return (FB & (1ULL << F)) != 0;
+ return STI->getFeatureBits()[F];
}
namespace {
@@ -398,7 +396,7 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
MCSubtargetInfo *X = createHexagonMCSubtargetInfoImpl(TT, CPUName, ArchFS);
if (HexagonDisableDuplex) {
llvm::FeatureBitset Features = X->getFeatureBits();
- X->setFeatureBits(Features.set(Hexagon::FeatureDuplex, false));
+ X->setFeatureBits(Features.reset(Hexagon::FeatureDuplex));
}
X->setFeatureBits(completeHVXFeatures(X->getFeatureBits()));
diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp
index 7702024f87bd..a9d39fd4b2dc 100644
--- a/lib/Target/Hexagon/RDFCopy.cpp
+++ b/lib/Target/Hexagon/RDFCopy.cpp
@@ -45,8 +45,8 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
const MachineOperand &Src = MI->getOperand(1);
RegisterRef DstR = DFG.makeRegRef(Dst.getReg(), Dst.getSubReg());
RegisterRef SrcR = DFG.makeRegRef(Src.getReg(), Src.getSubReg());
- assert(TargetRegisterInfo::isPhysicalRegister(DstR.Reg));
- assert(TargetRegisterInfo::isPhysicalRegister(SrcR.Reg));
+ assert(Register::isPhysicalRegister(DstR.Reg));
+ assert(Register::isPhysicalRegister(SrcR.Reg));
const TargetRegisterInfo &TRI = DFG.getTRI();
if (TRI.getMinimalPhysRegClass(DstR.Reg) !=
TRI.getMinimalPhysRegClass(SrcR.Reg))
diff --git a/lib/Target/Hexagon/RDFDeadCode.cpp b/lib/Target/Hexagon/RDFDeadCode.cpp
index 52178931aa6d..af86c7b1956b 100644
--- a/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -16,6 +16,7 @@
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
#include <queue>
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index 9d8f706b8a0f..0cb35dc98819 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -633,7 +633,7 @@ bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
// uses or defs, and those lists do not allow sub-registers.
if (Op.getSubReg() != 0)
return false;
- RegisterId Reg = Op.getReg();
+ Register Reg = Op.getReg();
const MCPhysReg *ImpR = Op.isDef() ? D.getImplicitDefs()
: D.getImplicitUses();
if (!ImpR)
@@ -963,7 +963,7 @@ void DataFlowGraph::build(unsigned Options) {
RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const {
assert(PhysicalRegisterInfo::isRegMaskId(Reg) ||
- TargetRegisterInfo::isPhysicalRegister(Reg));
+ Register::isPhysicalRegister(Reg));
assert(Reg != 0);
if (Sub != 0)
Reg = TRI.getSubReg(Reg, Sub);
@@ -1291,8 +1291,8 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
MachineOperand &Op = In.getOperand(OpN);
if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
continue;
- unsigned R = Op.getReg();
- if (!R || !TargetRegisterInfo::isPhysicalRegister(R))
+ Register R = Op.getReg();
+ if (!R || !Register::isPhysicalRegister(R))
continue;
uint16_t Flags = NodeAttrs::None;
if (TOI.isPreserving(In, OpN)) {
@@ -1336,8 +1336,8 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
MachineOperand &Op = In.getOperand(OpN);
if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
continue;
- unsigned R = Op.getReg();
- if (!R || !TargetRegisterInfo::isPhysicalRegister(R) || DoneDefs.test(R))
+ Register R = Op.getReg();
+ if (!R || !Register::isPhysicalRegister(R) || DoneDefs.test(R))
continue;
RegisterRef RR = makeRegRef(Op);
uint16_t Flags = NodeAttrs::None;
@@ -1365,8 +1365,8 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
MachineOperand &Op = In.getOperand(OpN);
if (!Op.isReg() || !Op.isUse())
continue;
- unsigned R = Op.getReg();
- if (!R || !TargetRegisterInfo::isPhysicalRegister(R))
+ Register R = Op.getReg();
+ if (!R || !Register::isPhysicalRegister(R))
continue;
uint16_t Flags = NodeAttrs::None;
if (Op.isUndef())
diff --git a/lib/Target/Hexagon/RDFLiveness.cpp b/lib/Target/Hexagon/RDFLiveness.cpp
index 9cd304aa10bc..7d7b89462ff9 100644
--- a/lib/Target/Hexagon/RDFLiveness.cpp
+++ b/lib/Target/Hexagon/RDFLiveness.cpp
@@ -889,8 +889,8 @@ void Liveness::resetKills(MachineBasicBlock *B) {
// implicit defs.
if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
continue;
- unsigned R = Op.getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(R))
+ Register R = Op.getReg();
+ if (!Register::isPhysicalRegister(R))
continue;
for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
Live.reset(*SR);
@@ -898,8 +898,8 @@ void Liveness::resetKills(MachineBasicBlock *B) {
for (auto &Op : MI->operands()) {
if (!Op.isReg() || !Op.isUse() || Op.isUndef())
continue;
- unsigned R = Op.getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(R))
+ Register R = Op.getReg();
+ if (!Register::isPhysicalRegister(R))
continue;
bool IsLive = false;
for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) {
diff --git a/lib/Target/Hexagon/RDFRegisters.cpp b/lib/Target/Hexagon/RDFRegisters.cpp
index 6e0f33695f0e..b5675784e34b 100644
--- a/lib/Target/Hexagon/RDFRegisters.cpp
+++ b/lib/Target/Hexagon/RDFRegisters.cpp
@@ -101,7 +101,7 @@ RegisterRef PhysicalRegisterInfo::normalize(RegisterRef RR) const {
std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const {
// Do not include RR in the alias set.
std::set<RegisterId> AS;
- assert(isRegMaskId(Reg) || TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(isRegMaskId(Reg) || Register::isPhysicalRegister(Reg));
if (isRegMaskId(Reg)) {
// XXX SLOW
const uint32_t *MB = getRegMaskBits(Reg);
@@ -129,8 +129,8 @@ std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const {
}
bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const {
- assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg));
- assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg));
+ assert(Register::isPhysicalRegister(RA.Reg));
+ assert(Register::isPhysicalRegister(RB.Reg));
MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
@@ -160,7 +160,7 @@ bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const {
}
bool PhysicalRegisterInfo::aliasRM(RegisterRef RR, RegisterRef RM) const {
- assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg));
+ assert(Register::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg));
const uint32_t *MB = getRegMaskBits(RM.Reg);
bool Preserved = MB[RR.Reg/32] & (1u << (RR.Reg%32));
// If the lane mask information is "full", e.g. when the given lane mask
diff --git a/lib/Target/Hexagon/RDFRegisters.h b/lib/Target/Hexagon/RDFRegisters.h
index 646233bacda5..4afaf80e4659 100644
--- a/lib/Target/Hexagon/RDFRegisters.h
+++ b/lib/Target/Hexagon/RDFRegisters.h
@@ -99,15 +99,15 @@ namespace rdf {
const MachineFunction &mf);
static bool isRegMaskId(RegisterId R) {
- return TargetRegisterInfo::isStackSlot(R);
+ return Register::isStackSlot(R);
}
RegisterId getRegMaskId(const uint32_t *RM) const {
- return TargetRegisterInfo::index2StackSlot(RegMasks.find(RM));
+ return Register::index2StackSlot(RegMasks.find(RM));
}
const uint32_t *getRegMaskBits(RegisterId R) const {
- return RegMasks.get(TargetRegisterInfo::stackSlot2Index(R));
+ return RegMasks.get(Register::stackSlot2Index(R));
}
RegisterRef normalize(RegisterRef RR) const;
@@ -125,7 +125,7 @@ namespace rdf {
}
const BitVector &getMaskUnits(RegisterId MaskId) const {
- return MaskInfos[TargetRegisterInfo::stackSlot2Index(MaskId)].Units;
+ return MaskInfos[Register::stackSlot2Index(MaskId)].Units;
}
RegisterRef mapTo(RegisterRef RR, unsigned R) const;
diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 9af8a0b35b2f..ec82e3a41f2a 100644
--- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -469,13 +469,14 @@ public:
else if (isa<LanaiMCExpr>(getImm())) {
#ifndef NDEBUG
const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
- assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO);
+ assert(SymbolRefExpr &&
+ SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO);
#endif
Inst.addOperand(MCOperand::createExpr(getImm()));
} else if (isa<MCBinaryExpr>(getImm())) {
#ifndef NDEBUG
const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
- assert(isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
LanaiMCExpr::VK_Lanai_ABS_LO);
#endif
@@ -499,13 +500,14 @@ public:
else if (isa<LanaiMCExpr>(getImm())) {
#ifndef NDEBUG
const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
- assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI);
+ assert(SymbolRefExpr &&
+ SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI);
#endif
Inst.addOperand(MCOperand::createExpr(getImm()));
} else if (isa<MCBinaryExpr>(getImm())) {
#ifndef NDEBUG
const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
- assert(isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
LanaiMCExpr::VK_Lanai_ABS_HI);
#endif
@@ -544,10 +546,9 @@ public:
} else if (isa<MCBinaryExpr>(getImm())) {
#ifndef NDEBUG
const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
- const LanaiMCExpr *SymbolRefExpr =
- dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS());
- assert(SymbolRefExpr &&
- SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None);
+ assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+ cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+ LanaiMCExpr::VK_Lanai_None);
#endif
Inst.addOperand(MCOperand::createExpr(getImm()));
} else
@@ -580,7 +581,7 @@ public:
}
static std::unique_ptr<LanaiOperand> CreateToken(StringRef Str, SMLoc Start) {
- auto Op = make_unique<LanaiOperand>(TOKEN);
+ auto Op = std::make_unique<LanaiOperand>(TOKEN);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = Start;
@@ -590,7 +591,7 @@ public:
static std::unique_ptr<LanaiOperand> createReg(unsigned RegNum, SMLoc Start,
SMLoc End) {
- auto Op = make_unique<LanaiOperand>(REGISTER);
+ auto Op = std::make_unique<LanaiOperand>(REGISTER);
Op->Reg.RegNum = RegNum;
Op->StartLoc = Start;
Op->EndLoc = End;
@@ -599,7 +600,7 @@ public:
static std::unique_ptr<LanaiOperand> createImm(const MCExpr *Value,
SMLoc Start, SMLoc End) {
- auto Op = make_unique<LanaiOperand>(IMMEDIATE);
+ auto Op = std::make_unique<LanaiOperand>(IMMEDIATE);
Op->Imm.Value = Value;
Op->StartLoc = Start;
Op->EndLoc = End;
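The assert rewrites above fold the null check of the dyn_cast result into the assertion instead of dereferencing an unchecked pointer. A self-contained sketch of the same pattern using plain C++ RTTI (the Expr/SymbolExpr types are illustrative, not from the patch):

#include <cassert>

struct Expr { virtual ~Expr() = default; };
struct SymbolExpr : Expr { int Kind = 0; };

// Mirror of the pattern: verify that the downcast succeeded and that the
// kind is right inside one assert, so debug builds report a failed cast
// instead of crashing on a null dereference.
void checkOperand(const Expr *E) {
  const auto *S = dynamic_cast<const SymbolExpr *>(E);
  assert(S && S->Kind == 0 && "operand must be a SymbolExpr of kind 0");
  (void)S; // the variable is only needed for the assertion
}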
diff --git a/lib/Target/Lanai/LanaiAsmPrinter.cpp b/lib/Target/Lanai/LanaiAsmPrinter.cpp
index 64d963475e1a..12a3202446a8 100644
--- a/lib/Target/Lanai/LanaiAsmPrinter.cpp
+++ b/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -133,7 +133,7 @@ bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO = MI->getOperand(RegOp);
if (!MO.isReg())
return true;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
O << LanaiInstPrinter::getRegisterName(Reg);
return false;
}
diff --git a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
index 09c63dca23e2..b9e577d201f9 100644
--- a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
+++ b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// Simple pass to fills delay slots with useful instructions.
+// Simple pass to fill delay slots with useful instructions.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Lanai/LanaiFrameLowering.cpp b/lib/Target/Lanai/LanaiFrameLowering.cpp
index 142c09c504cc..eddc2b8e61f7 100644
--- a/lib/Target/Lanai/LanaiFrameLowering.cpp
+++ b/lib/Target/Lanai/LanaiFrameLowering.cpp
@@ -72,8 +72,8 @@ void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const {
MachineInstr &MI = *MBBI++;
if (MI.getOpcode() == Lanai::ADJDYNALLOC) {
DebugLoc DL = MI.getDebugLoc();
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
BuildMI(*MBB, MI, DL, LII.get(Lanai::ADD_I_LO), Dst)
.addReg(Src)
diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h
index 5fe4535543ec..380d63df7301 100644
--- a/lib/Target/Lanai/LanaiFrameLowering.h
+++ b/lib/Target/Lanai/LanaiFrameLowering.h
@@ -31,7 +31,7 @@ protected:
public:
explicit LanaiFrameLowering(const LanaiSubtarget &Subtarget)
: TargetFrameLowering(StackGrowsDown,
- /*StackAlignment=*/8,
+ /*StackAlignment=*/Align(8),
/*LocalAreaOffset=*/0),
STI(Subtarget) {}
diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp
index 1ed078bb433f..43933d062a7e 100644
--- a/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -144,9 +144,9 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::XOR);
- // Function alignments (log2)
- setMinFunctionAlignment(2);
- setPrefFunctionAlignment(2);
+ // Function alignments
+ setMinFunctionAlignment(Align(4));
+ setPrefFunctionAlignment(Align(4));
setJumpIsExpensive(true);
@@ -212,10 +212,11 @@ SDValue LanaiTargetLowering::LowerOperation(SDValue Op,
// Lanai Inline Assembly Support
//===----------------------------------------------------------------------===//
-unsigned LanaiTargetLowering::getRegisterByName(const char *RegName, EVT /*VT*/,
- SelectionDAG & /*DAG*/) const {
+Register LanaiTargetLowering::getRegisterByName(
+ const char *RegName, EVT /*VT*/,
+ const MachineFunction & /*MF*/) const {
// Only unallocatable registers should be matched here.
- unsigned Reg = StringSwitch<unsigned>(RegName)
+ Register Reg = StringSwitch<unsigned>(RegName)
.Case("pc", Lanai::PC)
.Case("sp", Lanai::SP)
.Case("fp", Lanai::FP)
@@ -459,7 +460,7 @@ SDValue LanaiTargetLowering::LowerCCCArguments(
EVT RegVT = VA.getLocVT();
switch (RegVT.getSimpleVT().SimpleTy) {
case MVT::i32: {
- unsigned VReg = RegInfo.createVirtualRegister(&Lanai::GPRRegClass);
+ Register VReg = RegInfo.createVirtualRegister(&Lanai::GPRRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
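A quick worked check of the function-alignment change above (sketch, plain C++): the old setters took a log2 value, the new ones take an llvm::Align in bytes, so the previous argument 2 corresponds to Align(4).

#include <cassert>
#include <cstdint>

int main() {
  unsigned OldLog2Value = 2;                      // old setMinFunctionAlignment(2)
  uint64_t NewAlignBytes = uint64_t(1) << OldLog2Value;
  assert(NewAlignBytes == 4 && "log2 value 2 maps to Align(4)");
  return 0;
}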
diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h
index e7b5755e9041..4c35a2c6fb8e 100644
--- a/lib/Target/Lanai/LanaiISelLowering.h
+++ b/lib/Target/Lanai/LanaiISelLowering.h
@@ -90,8 +90,8 @@ public:
SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
- unsigned getRegisterByName(const char *RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char *RegName, EVT VT,
+ const MachineFunction &MF) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
diff --git a/lib/Target/Lanai/LanaiInstrInfo.cpp b/lib/Target/Lanai/LanaiInstrInfo.cpp
index 700a86069102..b950fd0424ef 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -86,8 +86,7 @@ void LanaiInstrInfo::loadRegFromStackSlot(
}
bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint(
- const MachineInstr &MIa, const MachineInstr &MIb,
- AliasAnalysis * /*AA*/) const {
+ const MachineInstr &MIa, const MachineInstr &MIb) const {
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
@@ -457,7 +456,7 @@ bool LanaiInstrInfo::analyzeSelect(const MachineInstr &MI,
// return the defining instruction.
static MachineInstr *canFoldIntoSelect(unsigned Reg,
const MachineRegisterInfo &MRI) {
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return nullptr;
if (!MRI.hasOneNonDBGUse(Reg))
return nullptr;
@@ -479,7 +478,7 @@ static MachineInstr *canFoldIntoSelect(unsigned Reg,
// MI can't have any tied operands, that would conflict with predication.
if (MO.isTied())
return nullptr;
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ if (Register::isPhysicalRegister(MO.getReg()))
return nullptr;
if (MO.isDef() && !MO.isDead())
return nullptr;
@@ -505,7 +504,7 @@ LanaiInstrInfo::optimizeSelect(MachineInstr &MI,
// Find new register class to use.
MachineOperand FalseReg = MI.getOperand(Invert ? 1 : 2);
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
if (!MRI.constrainRegClass(DestReg, PreviousClass))
return nullptr;
diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h
index d71424aeb0b1..59a04d2cc388 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/lib/Target/Lanai/LanaiInstrInfo.h
@@ -36,8 +36,7 @@ public:
}
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA) const override;
+ const MachineInstr &MIb) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp
index d3056a1eba8e..7c28debb94dd 100644
--- a/lib/Target/Lanai/LanaiRegisterInfo.cpp
+++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -155,7 +155,7 @@ void LanaiRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (!HasFP || (needsStackRealignment(MF) && FrameIndex >= 0))
Offset += MF.getFrameInfo().getStackSize();
- unsigned FrameReg = getFrameRegister(MF);
+ Register FrameReg = getFrameRegister(MF);
if (FrameIndex >= 0) {
if (hasBasePointer(MF))
FrameReg = getBaseRegister();
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
index 4313fa5a82b5..919d43ad9b9b 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
@@ -88,5 +88,5 @@ bool LanaiELFObjectWriter::needsRelocateWithSymbol(const MCSymbol & /*SD*/,
std::unique_ptr<MCObjectTargetWriter>
llvm::createLanaiELFObjectWriter(uint8_t OSABI) {
- return llvm::make_unique<LanaiELFObjectWriter>(OSABI);
+ return std::make_unique<LanaiELFObjectWriter>(OSABI);
}
diff --git a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index a0ec14ae2381..85dcc0f152f9 100644
--- a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -191,33 +191,33 @@ public:
}
static std::unique_ptr<MSP430Operand> CreateToken(StringRef Str, SMLoc S) {
- return make_unique<MSP430Operand>(Str, S);
+ return std::make_unique<MSP430Operand>(Str, S);
}
static std::unique_ptr<MSP430Operand> CreateReg(unsigned RegNum, SMLoc S,
SMLoc E) {
- return make_unique<MSP430Operand>(k_Reg, RegNum, S, E);
+ return std::make_unique<MSP430Operand>(k_Reg, RegNum, S, E);
}
static std::unique_ptr<MSP430Operand> CreateImm(const MCExpr *Val, SMLoc S,
SMLoc E) {
- return make_unique<MSP430Operand>(Val, S, E);
+ return std::make_unique<MSP430Operand>(Val, S, E);
}
static std::unique_ptr<MSP430Operand> CreateMem(unsigned RegNum,
const MCExpr *Val,
SMLoc S, SMLoc E) {
- return make_unique<MSP430Operand>(RegNum, Val, S, E);
+ return std::make_unique<MSP430Operand>(RegNum, Val, S, E);
}
static std::unique_ptr<MSP430Operand> CreateIndReg(unsigned RegNum, SMLoc S,
SMLoc E) {
- return make_unique<MSP430Operand>(k_IndReg, RegNum, S, E);
+ return std::make_unique<MSP430Operand>(k_IndReg, RegNum, S, E);
}
static std::unique_ptr<MSP430Operand> CreatePostIndReg(unsigned RegNum, SMLoc S,
SMLoc E) {
- return make_unique<MSP430Operand>(k_PostIndReg, RegNum, S, E);
+ return std::make_unique<MSP430Operand>(k_PostIndReg, RegNum, S, E);
}
SMLoc getStartLoc() const { return Start; }
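The make_unique changes in this and the neighbouring files drop the pre-C++14 llvm::make_unique shim in favour of std::make_unique. A trivial standalone sketch of the replacement (Operand is an illustrative type, not from the patch):

#include <memory>
#include <string>

struct Operand {
  explicit Operand(std::string Tok) : Token(std::move(Tok)) {}
  std::string Token;
};

int main() {
  // Previously spelled llvm::make_unique<Operand>("r4").
  std::unique_ptr<Operand> Op = std::make_unique<Operand>("r4");
  return Op->Token.empty() ? 1 : 0;
}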
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
index 38b7da32c246..0cdd1f4f701f 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
@@ -31,7 +31,7 @@ protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override {
// Translate fixup kind to ELF relocation type.
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
case FK_Data_1: return ELF::R_MSP430_8;
case FK_Data_2: return ELF::R_MSP430_16_BYTE;
case FK_Data_4: return ELF::R_MSP430_32;
@@ -54,5 +54,5 @@ protected:
std::unique_ptr<MCObjectTargetWriter>
llvm::createMSP430ELFObjectWriter(uint8_t OSABI) {
- return llvm::make_unique<MSP430ELFObjectWriter>(OSABI);
+ return std::make_unique<MSP430ELFObjectWriter>(OSABI);
}
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 3a71a084d1af..a3b91acdc6d0 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -159,8 +159,9 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) {
MCSection *Cur = OutStreamer->getCurrentSectionOnly();
const auto *F = &ISR.getFunction();
- assert(F->hasFnAttribute("interrupt") &&
- "Functions with MSP430_INTR CC should have 'interrupt' attribute");
+ if (F->getCallingConv() != CallingConv::MSP430_INTR) {
+ report_fatal_error("Functions with 'interrupt' attribute must have msp430_intrcc CC");
+ }
StringRef IVIdx = F->getFnAttribute("interrupt").getValueAsString();
MCSection *IV = OutStreamer->getContext().getELFSection(
"__interrupt_vector_" + IVIdx,
@@ -174,8 +175,9 @@ void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) {
bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Emit separate section for an interrupt vector if ISR
- if (MF.getFunction().getCallingConv() == CallingConv::MSP430_INTR)
+ if (MF.getFunction().hasFnAttribute("interrupt")) {
EmitInterruptVectorSection(MF);
+ }
SetupMachineFunction(MF);
EmitFunctionBody();
diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp
index 45e7c26e4d30..ce5affdc25b0 100644
--- a/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index 33ce3c70a2a3..70e284053021 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -22,7 +22,8 @@ protected:
public:
explicit MSP430FrameLowering()
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2) {}
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(2), -2,
+ Align(2)) {}
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index fedfb857bd0f..64169d1f5eb1 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -327,8 +327,8 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN);
// TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
- setMinFunctionAlignment(1);
- setPrefFunctionAlignment(1);
+ setMinFunctionAlignment(Align(2));
+ setPrefFunctionAlignment(Align(2));
}
SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
@@ -353,6 +353,9 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
}
}
+unsigned MSP430TargetLowering::getShiftAmountThreshold(EVT VT) const {
+ return 2;
+}
//===----------------------------------------------------------------------===//
// MSP430 Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -632,7 +635,7 @@ SDValue MSP430TargetLowering::LowerCCCArguments(
llvm_unreachable(nullptr);
}
case MVT::i16:
- unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass);
+ Register VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
@@ -1446,8 +1449,8 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
case MSP430::Rrcl16: {
BuildMI(*BB, MI, dl, TII.get(MSP430::BIC16rc), MSP430::SR)
.addReg(MSP430::SR).addImm(1);
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
unsigned RrcOpc = MI.getOpcode() == MSP430::Rrcl16
? MSP430::RRC16r : MSP430::RRC8r;
BuildMI(*BB, MI, dl, TII.get(RrcOpc), DstReg)
@@ -1479,13 +1482,13 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
LoopBB->addSuccessor(RemBB);
LoopBB->addSuccessor(LoopBB);
- unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass);
- unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass);
- unsigned ShiftReg = RI.createVirtualRegister(RC);
- unsigned ShiftReg2 = RI.createVirtualRegister(RC);
- unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass);
+ Register ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass);
+ Register ShiftReg = RI.createVirtualRegister(RC);
+ Register ShiftReg2 = RI.createVirtualRegister(RC);
+ Register ShiftAmtSrcReg = MI.getOperand(2).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
// BB:
// cmp 0, N
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index ee6b6316d7a9..9224e5e3d005 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -124,6 +124,8 @@ namespace llvm {
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ unsigned getShiftAmountThreshold(EVT VT) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index afbb2f213b45..bec357a1548d 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -139,7 +139,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
return;
// We need to materialize the offset via add instruction.
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if (Offset < 0)
BuildMI(MBB, std::next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
.addReg(DstReg).addImm(-Offset);
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 8c4ca982c966..e9aeba76de85 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -46,7 +46,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS,
Options, getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
- TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 1f7d095bf49b..21d0df74d458 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -39,6 +39,7 @@
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
@@ -233,9 +234,14 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
- bool expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, bool Is64FPU,
- SMLoc IDLoc, MCStreamer &Out,
- const MCSubtargetInfo *STI);
+ bool expandLoadSingleImmToGPR(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+ bool expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+ bool expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+ bool expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU, SMLoc IDLoc,
+ MCStreamer &Out, const MCSubtargetInfo *STI);
bool expandLoadAddress(unsigned DstReg, unsigned BaseReg,
const MCOperand &Offset, bool Is32BitAddress,
@@ -512,11 +518,11 @@ public:
// Remember the initial assembler options. The user can not modify these.
AssemblerOptions.push_back(
- llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
+ std::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
// Create an assembler options environment for the user to modify.
AssemblerOptions.push_back(
- llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
+ std::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
getTargetStreamer().updateABIInfo(*this);
@@ -844,7 +850,7 @@ private:
const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E,
MipsAsmParser &Parser) {
- auto Op = llvm::make_unique<MipsOperand>(k_RegisterIndex, Parser);
+ auto Op = std::make_unique<MipsOperand>(k_RegisterIndex, Parser);
Op->RegIdx.Index = Index;
Op->RegIdx.RegInfo = RegInfo;
Op->RegIdx.Kind = RegKind;
@@ -1446,7 +1452,7 @@ public:
static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
MipsAsmParser &Parser) {
- auto Op = llvm::make_unique<MipsOperand>(k_Token, Parser);
+ auto Op = std::make_unique<MipsOperand>(k_Token, Parser);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -1521,7 +1527,7 @@ public:
static std::unique_ptr<MipsOperand>
CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
- auto Op = llvm::make_unique<MipsOperand>(k_Immediate, Parser);
+ auto Op = std::make_unique<MipsOperand>(k_Immediate, Parser);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1531,7 +1537,7 @@ public:
static std::unique_ptr<MipsOperand>
CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S,
SMLoc E, MipsAsmParser &Parser) {
- auto Op = llvm::make_unique<MipsOperand>(k_Memory, Parser);
+ auto Op = std::make_unique<MipsOperand>(k_Memory, Parser);
Op->Mem.Base = Base.release();
Op->Mem.Off = Off;
Op->StartLoc = S;
@@ -1544,7 +1550,7 @@ public:
MipsAsmParser &Parser) {
assert(Regs.size() > 0 && "Empty list not allowed");
- auto Op = llvm::make_unique<MipsOperand>(k_RegList, Parser);
+ auto Op = std::make_unique<MipsOperand>(k_RegList, Parser);
Op->RegList.List = new SmallVector<unsigned, 10>(Regs.begin(), Regs.end());
Op->StartLoc = StartLoc;
Op->EndLoc = EndLoc;
@@ -1804,8 +1810,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
- if (OffsetToAlignment(Offset.getImm(),
- 1LL << (inMicroMipsMode() ? 1 : 2)))
+ if (offsetToAlignment(Offset.getImm(),
+ (inMicroMipsMode() ? Align(2) : Align(4))))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BGEZ:
@@ -1834,8 +1840,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
- if (OffsetToAlignment(Offset.getImm(),
- 1LL << (inMicroMipsMode() ? 1 : 2)))
+ if (offsetToAlignment(Offset.getImm(),
+ (inMicroMipsMode() ? Align(2) : Align(4))))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BGEC: case Mips::BGEC_MMR6:
@@ -1850,7 +1856,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
- if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
+ if (offsetToAlignment(Offset.getImm(), Align(4)))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BLEZC: case Mips::BLEZC_MMR6:
@@ -1863,7 +1869,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
- if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
+ if (offsetToAlignment(Offset.getImm(), Align(4)))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BEQZC: case Mips::BEQZC_MMR6:
@@ -1874,7 +1880,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
break; // We'll deal with this situation later on when applying fixups.
if (!isIntN(23, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
- if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
+ if (offsetToAlignment(Offset.getImm(), Align(4)))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BEQZ16_MM:
@@ -1887,7 +1893,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
break; // We'll deal with this situation later on when applying fixups.
if (!isInt<8>(Offset.getImm()))
return Error(IDLoc, "branch target out of range");
- if (OffsetToAlignment(Offset.getImm(), 2LL))
+ if (offsetToAlignment(Offset.getImm(), Align(2)))
return Error(IDLoc, "branch to misaligned address");
break;
}
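A worked example of the branch-alignment checks above (sketch): offsetToAlignment(Value, Align) returns the bytes needed to round Value up to the alignment, so a non-zero result flags a misaligned target; the old OffsetToAlignment took the raw byte count (1LL << 2), the new form takes Align(4). The helper below reimplements the same arithmetic so the check runs without LLVM headers.

#include <cassert>
#include <cstdint>

// Same result as llvm::offsetToAlignment(Value, Align(A)).
static uint64_t offsetToAlignmentBytes(uint64_t Value, uint64_t A) {
  return (A - (Value % A)) % A;
}

int main() {
  assert(offsetToAlignmentBytes(0x400, 4) == 0); // aligned: accepted
  assert(offsetToAlignmentBytes(0x402, 4) == 2); // misaligned: error path
  assert(offsetToAlignmentBytes(0x402, 2) == 0); // microMIPS only needs 2-byte alignment
  return 0;
}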
@@ -2454,25 +2460,21 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
: MER_Success;
case Mips::LoadImmSingleGPR:
- return expandLoadImmReal(Inst, true, true, false, IDLoc, Out, STI)
- ? MER_Fail
- : MER_Success;
+ return expandLoadSingleImmToGPR(Inst, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
case Mips::LoadImmSingleFGR:
- return expandLoadImmReal(Inst, true, false, false, IDLoc, Out, STI)
- ? MER_Fail
- : MER_Success;
+ return expandLoadSingleImmToFPR(Inst, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
case Mips::LoadImmDoubleGPR:
- return expandLoadImmReal(Inst, false, true, false, IDLoc, Out, STI)
- ? MER_Fail
- : MER_Success;
+ return expandLoadDoubleImmToGPR(Inst, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
case Mips::LoadImmDoubleFGR:
- return expandLoadImmReal(Inst, false, false, true, IDLoc, Out, STI)
- ? MER_Fail
- : MER_Success;
+ return expandLoadDoubleImmToFPR(Inst, true, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
case Mips::LoadImmDoubleFGR_32:
- return expandLoadImmReal(Inst, false, false, false, IDLoc, Out, STI)
- ? MER_Fail
- : MER_Success;
+ return expandLoadDoubleImmToFPR(Inst, false, IDLoc, Out, STI) ? MER_Fail
+ : MER_Success;
+
case Mips::Ulh:
return expandUlh(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::Ulhu:
@@ -2868,12 +2870,12 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
bool Is32BitSym, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
- // FIXME: These expansions do not respect -mxgot.
MipsTargetStreamer &TOut = getTargetStreamer();
- bool UseSrcReg = SrcReg != Mips::NoRegister;
+ bool UseSrcReg = SrcReg != Mips::NoRegister && SrcReg != Mips::ZERO &&
+ SrcReg != Mips::ZERO_64;
warnIfNoMacro(IDLoc);
- if (inPicMode() && ABI.IsO32()) {
+ if (inPicMode()) {
MCValue Res;
if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) {
Error(IDLoc, "expected relocatable expression");
@@ -2884,46 +2886,41 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
return true;
}
+ bool IsPtr64 = ABI.ArePtrs64bit();
+ bool IsLocalSym =
+ Res.getSymA()->getSymbol().isInSection() ||
+ Res.getSymA()->getSymbol().isTemporary() ||
+ (Res.getSymA()->getSymbol().isELF() &&
+ cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() ==
+ ELF::STB_LOCAL);
+ bool UseXGOT = STI->getFeatureBits()[Mips::FeatureXGOT] && !IsLocalSym;
+
// The case where the result register is $25 is somewhat special. If the
// symbol in the final relocation is external and not modified with a
- // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16.
+ // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16
+ // or R_MIPS_CALL16 instead of R_MIPS_GOT_DISP in 64-bit case.
if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
- Res.getConstant() == 0 &&
- !(Res.getSymA()->getSymbol().isInSection() ||
- Res.getSymA()->getSymbol().isTemporary() ||
- (Res.getSymA()->getSymbol().isELF() &&
- cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() ==
- ELF::STB_LOCAL))) {
- const MCExpr *CallExpr =
- MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
- TOut.emitRRX(Mips::LW, DstReg, GPReg, MCOperand::createExpr(CallExpr),
- IDLoc, STI);
+ Res.getConstant() == 0 && !IsLocalSym) {
+ if (UseXGOT) {
+ const MCExpr *CallHiExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_HI16,
+ SymExpr, getContext());
+ const MCExpr *CallLoExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_LO16,
+ SymExpr, getContext());
+ TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(CallHiExpr), IDLoc,
+ STI);
+ TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, DstReg, GPReg,
+ IDLoc, STI);
+ TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, DstReg,
+ MCOperand::createExpr(CallLoExpr), IDLoc, STI);
+ } else {
+ const MCExpr *CallExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
+ TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, GPReg,
+ MCOperand::createExpr(CallExpr), IDLoc, STI);
+ }
return false;
}
- // The remaining cases are:
- // External GOT: lw $tmp, %got(symbol+offset)($gp)
- // >addiu $tmp, $tmp, %lo(offset)
- // >addiu $rd, $tmp, $rs
- // Local GOT: lw $tmp, %got(symbol+offset)($gp)
- // addiu $tmp, $tmp, %lo(symbol+offset)($gp)
- // >addiu $rd, $tmp, $rs
- // The addiu's marked with a '>' may be omitted if they are redundant. If
- // this happens then the last instruction must use $rd as the result
- // register.
- const MipsMCExpr *GotExpr =
- MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext());
- const MCExpr *LoExpr = nullptr;
- if (Res.getSymA()->getSymbol().isInSection() ||
- Res.getSymA()->getSymbol().isTemporary())
- LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
- else if (Res.getConstant() != 0) {
- // External symbols fully resolve the symbol with just the %got(symbol)
- // but we must still account for any offset to the symbol for expressions
- // like symbol+8.
- LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
- }
-
unsigned TmpReg = DstReg;
if (UseSrcReg &&
getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
@@ -2936,94 +2933,102 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
TmpReg = ATReg;
}
- TOut.emitRRX(Mips::LW, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc,
- STI);
-
- if (LoExpr)
- TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
+ if (UseXGOT) {
+ // Loading address from XGOT
+ // External GOT: lui $tmp, %got_hi(symbol)($gp)
+ // addu $tmp, $tmp, $gp
+ // lw $tmp, %got_lo(symbol)($tmp)
+ // >addiu $tmp, $tmp, offset
+ // >addiu $rd, $tmp, $rs
+ // The addiu's marked with a '>' may be omitted if they are redundant. If
+ // this happens then the last instruction must use $rd as the result
+ // register.
+ const MCExpr *CallHiExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, SymExpr, getContext());
+ const MCExpr *CallLoExpr = MipsMCExpr::create(
+ MipsMCExpr::MEK_GOT_LO16, Res.getSymA(), getContext());
+
+ TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(CallHiExpr), IDLoc,
+ STI);
+ TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg, GPReg,
IDLoc, STI);
+ TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, TmpReg, TmpReg,
+ MCOperand::createExpr(CallLoExpr), IDLoc, STI);
- if (UseSrcReg)
- TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
-
- return false;
- }
-
- if (inPicMode() && ABI.ArePtrs64bit()) {
- MCValue Res;
- if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) {
- Error(IDLoc, "expected relocatable expression");
- return true;
- }
- if (Res.getSymB() != nullptr) {
- Error(IDLoc, "expected relocatable expression with only one symbol");
- return true;
- }
+ if (Res.getConstant() != 0)
+ TOut.emitRRX(IsPtr64 ? Mips::DADDiu : Mips::ADDiu, TmpReg, TmpReg,
+ MCOperand::createExpr(MCConstantExpr::create(
+ Res.getConstant(), getContext())),
+ IDLoc, STI);
- // The case where the result register is $25 is somewhat special. If the
- // symbol in the final relocation is external and not modified with a
- // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT_DISP.
- if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
- Res.getConstant() == 0 &&
- !(Res.getSymA()->getSymbol().isInSection() ||
- Res.getSymA()->getSymbol().isTemporary() ||
- (Res.getSymA()->getSymbol().isELF() &&
- cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() ==
- ELF::STB_LOCAL))) {
- const MCExpr *CallExpr =
- MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
- TOut.emitRRX(Mips::LD, DstReg, GPReg, MCOperand::createExpr(CallExpr),
- IDLoc, STI);
+ if (UseSrcReg)
+ TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, TmpReg, SrcReg,
+ IDLoc, STI);
return false;
}
- // The remaining cases are:
- // Small offset: ld $tmp, %got_disp(symbol)($gp)
- // >daddiu $tmp, $tmp, offset
- // >daddu $rd, $tmp, $rs
- // The daddiu's marked with a '>' may be omitted if they are redundant. If
- // this happens then the last instruction must use $rd as the result
- // register.
- const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP,
- Res.getSymA(),
- getContext());
+ const MipsMCExpr *GotExpr = nullptr;
const MCExpr *LoExpr = nullptr;
- if (Res.getConstant() != 0) {
- // Symbols fully resolve with just the %got_disp(symbol) but we
- // must still account for any offset to the symbol for
- // expressions like symbol+8.
- LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
-
- // FIXME: Offsets greater than 16 bits are not yet implemented.
- // FIXME: The correct range is a 32-bit sign-extended number.
- if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) {
- Error(IDLoc, "macro instruction uses large offset, which is not "
- "currently supported");
- return true;
+ if (IsPtr64) {
+ // The remaining cases are:
+ // Small offset: ld $tmp, %got_disp(symbol)($gp)
+ // >daddiu $tmp, $tmp, offset
+ // >daddu $rd, $tmp, $rs
+ // The daddiu's marked with a '>' may be omitted if they are redundant. If
+ // this happens then the last instruction must use $rd as the result
+ // register.
+ GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, Res.getSymA(),
+ getContext());
+ if (Res.getConstant() != 0) {
+ // Symbols fully resolve with just the %got_disp(symbol) but we
+ // must still account for any offset to the symbol for
+ // expressions like symbol+8.
+ LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
+
+ // FIXME: Offsets greater than 16 bits are not yet implemented.
+ // FIXME: The correct range is a 32-bit sign-extended number.
+ if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) {
+ Error(IDLoc, "macro instruction uses large offset, which is not "
+ "currently supported");
+ return true;
+ }
+ }
+ } else {
+ // The remaining cases are:
+ // External GOT: lw $tmp, %got(symbol)($gp)
+ // >addiu $tmp, $tmp, offset
+ // >addiu $rd, $tmp, $rs
+ // Local GOT: lw $tmp, %got(symbol+offset)($gp)
+ // addiu $tmp, $tmp, %lo(symbol+offset)($gp)
+ // >addiu $rd, $tmp, $rs
+ // The addiu's marked with a '>' may be omitted if they are redundant. If
+ // this happens then the last instruction must use $rd as the result
+ // register.
+ if (IsLocalSym) {
+ GotExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext());
+ LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+ } else {
+ // External symbols fully resolve the symbol with just the %got(symbol)
+ // but we must still account for any offset to the symbol for
+ // expressions like symbol+8.
+ GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT, Res.getSymA(),
+ getContext());
+ if (Res.getConstant() != 0)
+ LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
}
}
- unsigned TmpReg = DstReg;
- if (UseSrcReg &&
- getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
- SrcReg)) {
- // If $rs is the same as $rd, we need to use AT.
- // If it is not available we exit.
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
- return true;
- TmpReg = ATReg;
- }
-
- TOut.emitRRX(Mips::LD, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc,
- STI);
+ TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, TmpReg, GPReg,
+ MCOperand::createExpr(GotExpr), IDLoc, STI);
if (LoExpr)
- TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
- IDLoc, STI);
+ TOut.emitRRX(IsPtr64 ? Mips::DADDiu : Mips::ADDiu, TmpReg, TmpReg,
+ MCOperand::createExpr(LoExpr), IDLoc, STI);
if (UseSrcReg)
- TOut.emitRRR(Mips::DADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
+ TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, TmpReg, SrcReg,
+ IDLoc, STI);
return false;
}
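To summarise the rewritten PIC expansion above, the following sketch condenses its decision logic into one function; the booleans mirror those computed in the hunk (UseXGOT, IsLocalSym, IsPtr64), the strings name the emitted sequences, and none of this is an LLVM API.

#include <cstdio>
#include <string>

static std::string pickGotFlavor(bool DstIsT9, bool HasOffset, bool UseSrcReg,
                                 bool IsLocalSym, bool UseXGOT, bool IsPtr64) {
  // A $25/t9 destination with a plain external symbol uses the call relocations.
  if (DstIsT9 && !UseSrcReg && !HasOffset && !IsLocalSym)
    return UseXGOT ? "lui %call_hi16 + addu + lw/ld %call_lo16"
                   : (IsPtr64 ? "ld %call16" : "lw %call16");
  if (UseXGOT)
    return "lui %got_hi16 + addu + lw/ld %got_lo16 (+ addiu offset)";
  if (IsPtr64)
    return "ld %got_disp (+ daddiu offset)";
  return IsLocalSym ? "lw %got + addiu %lo"
                    : "lw %got (+ addiu offset)";
}

int main() {
  std::puts(pickGotFlavor(true, false, false, false, true, false).c_str());
  return 0;
}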
@@ -3289,10 +3294,43 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
return false;
}
-bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR,
- bool Is64FPU, SMLoc IDLoc,
- MCStreamer &Out,
- const MCSubtargetInfo *STI) {
+static uint64_t convertIntToDoubleImm(uint64_t ImmOp64) {
+ // If ImmOp64 is AsmToken::Integer type (all bits set to zero in the
+ // exponent field), convert it to double (e.g. 1 to 1.0)
+ if ((Hi_32(ImmOp64) & 0x7ff00000) == 0) {
+ APFloat RealVal(APFloat::IEEEdouble(), ImmOp64);
+ ImmOp64 = RealVal.bitcastToAPInt().getZExtValue();
+ }
+ return ImmOp64;
+}
+
+static uint32_t covertDoubleImmToSingleImm(uint64_t ImmOp64) {
+ // Conversion of a double in an uint64_t to a float in a uint32_t,
+ // retaining the bit pattern of a float.
+ double DoubleImm = BitsToDouble(ImmOp64);
+ float TmpFloat = static_cast<float>(DoubleImm);
+ return FloatToBits(TmpFloat);
+}
+
+bool MipsAsmParser::expandLoadSingleImmToGPR(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ assert(Inst.getNumOperands() == 2 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
+ "Invalid instruction operand.");
+
+ unsigned FirstReg = Inst.getOperand(0).getReg();
+ uint64_t ImmOp64 = Inst.getOperand(1).getImm();
+
+ uint32_t ImmOp32 = covertDoubleImmToSingleImm(convertIntToDoubleImm(ImmOp64));
+
+ return loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, false, IDLoc,
+ Out, STI);
+}
+
+bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
assert(Inst.getNumOperands() == 2 && "Invalid operand count");
assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
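The two helpers added above isolate the immediate conversions. A standalone sketch of the second one, using memcpy in place of the LLVM BitsToDouble/FloatToBits utilities:

#include <cassert>
#include <cstdint>
#include <cstring>

// Reinterpret a 64-bit pattern as a double, narrow it to float, and return
// the float's bit pattern -- the covertDoubleImmToSingleImm step above.
static uint32_t doubleBitsToSingleBits(uint64_t Bits64) {
  double D;
  std::memcpy(&D, &Bits64, sizeof(D));
  float F = static_cast<float>(D);
  uint32_t Bits32;
  std::memcpy(&Bits32, &F, sizeof(F));
  return Bits32;
}

int main() {
  // 1.0 is 0x3FF0000000000000 as a double and 0x3F800000 as a float.
  assert(doubleBitsToSingleBits(0x3FF0000000000000ULL) == 0x3F800000U);
  return 0;
}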
@@ -3301,166 +3339,184 @@ bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR,
unsigned FirstReg = Inst.getOperand(0).getReg();
uint64_t ImmOp64 = Inst.getOperand(1).getImm();
- uint32_t HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32;
- // If ImmOp64 is AsmToken::Integer type (all bits set to zero in the
- // exponent field), convert it to double (e.g. 1 to 1.0)
- if ((HiImmOp64 & 0x7ff00000) == 0) {
- APFloat RealVal(APFloat::IEEEdouble(), ImmOp64);
- ImmOp64 = RealVal.bitcastToAPInt().getZExtValue();
+ ImmOp64 = convertIntToDoubleImm(ImmOp64);
+
+ uint32_t ImmOp32 = covertDoubleImmToSingleImm(ImmOp64);
+
+ unsigned TmpReg = Mips::ZERO;
+ if (ImmOp32 != 0) {
+ TmpReg = getATReg(IDLoc);
+ if (!TmpReg)
+ return true;
}
- uint32_t LoImmOp64 = ImmOp64 & 0xffffffff;
- HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32;
+ if (Lo_32(ImmOp64) == 0) {
+ if (TmpReg != Mips::ZERO && loadImmediate(ImmOp32, TmpReg, Mips::NoRegister,
+ true, false, IDLoc, Out, STI))
+ return true;
+ TOut.emitRR(Mips::MTC1, FirstReg, TmpReg, IDLoc, STI);
+ return false;
+ }
- if (IsSingle) {
- // Conversion of a double in an uint64_t to a float in a uint32_t,
- // retaining the bit pattern of a float.
- uint32_t ImmOp32;
- double doubleImm = BitsToDouble(ImmOp64);
- float tmp_float = static_cast<float>(doubleImm);
- ImmOp32 = FloatToBits(tmp_float);
+ MCSection *CS = getStreamer().getCurrentSectionOnly();
+ // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
+ // where appropriate.
+ MCSection *ReadOnlySection =
+ getContext().getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
- if (IsGPR) {
- if (loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, true, IDLoc,
- Out, STI))
- return true;
- return false;
- } else {
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
- return true;
- if (LoImmOp64 == 0) {
- if (loadImmediate(ImmOp32, ATReg, Mips::NoRegister, true, true, IDLoc,
- Out, STI))
- return true;
- TOut.emitRR(Mips::MTC1, FirstReg, ATReg, IDLoc, STI);
- return false;
- }
+ MCSymbol *Sym = getContext().createTempSymbol();
+ const MCExpr *LoSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *LoExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
- MCSection *CS = getStreamer().getCurrentSectionOnly();
- // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
- // where appropriate.
- MCSection *ReadOnlySection = getContext().getELFSection(
- ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ getStreamer().SwitchSection(ReadOnlySection);
+ getStreamer().EmitLabel(Sym, IDLoc);
+ getStreamer().EmitIntValue(ImmOp32, 4);
+ getStreamer().SwitchSection(CS);
- MCSymbol *Sym = getContext().createTempSymbol();
- const MCExpr *LoSym =
- MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
- const MipsMCExpr *LoExpr =
- MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+ if (emitPartialAddress(TOut, IDLoc, Sym))
+ return true;
+ TOut.emitRRX(Mips::LWC1, FirstReg, TmpReg, MCOperand::createExpr(LoExpr),
+ IDLoc, STI);
+ return false;
+}
+
+bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ assert(Inst.getNumOperands() == 2 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
+ "Invalid instruction operand.");
+
+ unsigned FirstReg = Inst.getOperand(0).getReg();
+ uint64_t ImmOp64 = Inst.getOperand(1).getImm();
+
+ ImmOp64 = convertIntToDoubleImm(ImmOp64);
- getStreamer().SwitchSection(ReadOnlySection);
- getStreamer().EmitLabel(Sym, IDLoc);
- getStreamer().EmitIntValue(ImmOp32, 4);
- getStreamer().SwitchSection(CS);
+ if (Lo_32(ImmOp64) == 0) {
+ if (isGP64bit()) {
+ if (loadImmediate(ImmOp64, FirstReg, Mips::NoRegister, false, false,
+ IDLoc, Out, STI))
+ return true;
+ } else {
+ if (loadImmediate(Hi_32(ImmOp64), FirstReg, Mips::NoRegister, true, false,
+ IDLoc, Out, STI))
+ return true;
- if(emitPartialAddress(TOut, IDLoc, Sym))
+ if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, false,
+ IDLoc, Out, STI))
return true;
- TOut.emitRRX(Mips::LWC1, FirstReg, ATReg,
- MCOperand::createExpr(LoExpr), IDLoc, STI);
}
return false;
}
- // if(!IsSingle)
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
+ MCSection *CS = getStreamer().getCurrentSectionOnly();
+ MCSection *ReadOnlySection =
+ getContext().getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+ MCSymbol *Sym = getContext().createTempSymbol();
+ const MCExpr *LoSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *LoExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+
+ getStreamer().SwitchSection(ReadOnlySection);
+ getStreamer().EmitLabel(Sym, IDLoc);
+ getStreamer().EmitValueToAlignment(8);
+ getStreamer().EmitIntValue(ImmOp64, 8);
+ getStreamer().SwitchSection(CS);
+
+ unsigned TmpReg = getATReg(IDLoc);
+ if (!TmpReg)
return true;
- if (IsGPR) {
- if (LoImmOp64 == 0) {
- if(isABI_N32() || isABI_N64()) {
- if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, false, true,
- IDLoc, Out, STI))
- return true;
- return false;
- } else {
- if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, true, true,
- IDLoc, Out, STI))
- return true;
+ if (emitPartialAddress(TOut, IDLoc, Sym))
+ return true;
- if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, true,
- IDLoc, Out, STI))
- return true;
- return false;
- }
- }
+ TOut.emitRRX(isABI_N64() ? Mips::DADDiu : Mips::ADDiu, TmpReg, TmpReg,
+ MCOperand::createExpr(LoExpr), IDLoc, STI);
- MCSection *CS = getStreamer().getCurrentSectionOnly();
- MCSection *ReadOnlySection = getContext().getELFSection(
- ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ if (isGP64bit())
+ TOut.emitRRI(Mips::LD, FirstReg, TmpReg, 0, IDLoc, STI);
+ else {
+ TOut.emitRRI(Mips::LW, FirstReg, TmpReg, 0, IDLoc, STI);
+ TOut.emitRRI(Mips::LW, nextReg(FirstReg), TmpReg, 4, IDLoc, STI);
+ }
+ return false;
+}
- MCSymbol *Sym = getContext().createTempSymbol();
- const MCExpr *LoSym =
- MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
- const MipsMCExpr *LoExpr =
- MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
+ SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ assert(Inst.getNumOperands() == 2 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
+ "Invalid instruction operand.");
- getStreamer().SwitchSection(ReadOnlySection);
- getStreamer().EmitLabel(Sym, IDLoc);
- getStreamer().EmitIntValue(HiImmOp64, 4);
- getStreamer().EmitIntValue(LoImmOp64, 4);
- getStreamer().SwitchSection(CS);
+ unsigned FirstReg = Inst.getOperand(0).getReg();
+ uint64_t ImmOp64 = Inst.getOperand(1).getImm();
+
+ ImmOp64 = convertIntToDoubleImm(ImmOp64);
- if(emitPartialAddress(TOut, IDLoc, Sym))
+ unsigned TmpReg = Mips::ZERO;
+ if (ImmOp64 != 0) {
+ TmpReg = getATReg(IDLoc);
+ if (!TmpReg)
return true;
- if(isABI_N64())
- TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
- MCOperand::createExpr(LoExpr), IDLoc, STI);
- else
- TOut.emitRRX(Mips::ADDiu, ATReg, ATReg,
- MCOperand::createExpr(LoExpr), IDLoc, STI);
+ }
- if(isABI_N32() || isABI_N64())
- TOut.emitRRI(Mips::LD, FirstReg, ATReg, 0, IDLoc, STI);
- else {
- TOut.emitRRI(Mips::LW, FirstReg, ATReg, 0, IDLoc, STI);
- TOut.emitRRI(Mips::LW, nextReg(FirstReg), ATReg, 4, IDLoc, STI);
- }
- return false;
- } else { // if(!IsGPR && !IsSingle)
- if ((LoImmOp64 == 0) &&
- !((HiImmOp64 & 0xffff0000) && (HiImmOp64 & 0x0000ffff))) {
- // FIXME: In the case where the constant is zero, we can load the
- // register directly from the zero register.
- if (loadImmediate(HiImmOp64, ATReg, Mips::NoRegister, true, true, IDLoc,
+ if ((Lo_32(ImmOp64) == 0) &&
+ !((Hi_32(ImmOp64) & 0xffff0000) && (Hi_32(ImmOp64) & 0x0000ffff))) {
+ if (isGP64bit()) {
+ if (TmpReg != Mips::ZERO &&
+ loadImmediate(ImmOp64, TmpReg, Mips::NoRegister, false, false, IDLoc,
Out, STI))
return true;
- if (isABI_N32() || isABI_N64())
- TOut.emitRR(Mips::DMTC1, FirstReg, ATReg, IDLoc, STI);
- else if (hasMips32r2()) {
- TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
- TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, ATReg, IDLoc, STI);
- } else {
- TOut.emitRR(Mips::MTC1, nextReg(FirstReg), ATReg, IDLoc, STI);
- TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
- }
+ TOut.emitRR(Mips::DMTC1, FirstReg, TmpReg, IDLoc, STI);
return false;
}
- MCSection *CS = getStreamer().getCurrentSectionOnly();
- // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
- // where appropriate.
- MCSection *ReadOnlySection = getContext().getELFSection(
- ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ if (TmpReg != Mips::ZERO &&
+ loadImmediate(Hi_32(ImmOp64), TmpReg, Mips::NoRegister, true, false,
+ IDLoc, Out, STI))
+ return true;
- MCSymbol *Sym = getContext().createTempSymbol();
- const MCExpr *LoSym =
- MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
- const MipsMCExpr *LoExpr =
- MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+ if (hasMips32r2()) {
+ TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
+ TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, TmpReg, IDLoc, STI);
+ } else {
+ TOut.emitRR(Mips::MTC1, nextReg(FirstReg), TmpReg, IDLoc, STI);
+ TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
+ }
+ return false;
+ }
- getStreamer().SwitchSection(ReadOnlySection);
- getStreamer().EmitLabel(Sym, IDLoc);
- getStreamer().EmitIntValue(HiImmOp64, 4);
- getStreamer().EmitIntValue(LoImmOp64, 4);
- getStreamer().SwitchSection(CS);
+ MCSection *CS = getStreamer().getCurrentSectionOnly();
+ // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
+ // where appropriate.
+ MCSection *ReadOnlySection =
+ getContext().getELFSection(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+ MCSymbol *Sym = getContext().createTempSymbol();
+ const MCExpr *LoSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *LoExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+
+ getStreamer().SwitchSection(ReadOnlySection);
+ getStreamer().EmitLabel(Sym, IDLoc);
+ getStreamer().EmitValueToAlignment(8);
+ getStreamer().EmitIntValue(ImmOp64, 8);
+ getStreamer().SwitchSection(CS);
+
+ if (emitPartialAddress(TOut, IDLoc, Sym))
+ return true;
+
+ TOut.emitRRX(Is64FPU ? Mips::LDC164 : Mips::LDC1, FirstReg, TmpReg,
+ MCOperand::createExpr(LoExpr), IDLoc, STI);
- if(emitPartialAddress(TOut, IDLoc, Sym))
- return true;
- TOut.emitRRX(Is64FPU ? Mips::LDC164 : Mips::LDC1, FirstReg, ATReg,
- MCOperand::createExpr(LoExpr), IDLoc, STI);
- }
return false;
}
@@ -3489,7 +3545,7 @@ bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
} else {
if (!isInt<17>(Offset.getImm()))
return Error(IDLoc, "branch target out of range");
- if (OffsetToAlignment(Offset.getImm(), 1LL << 1))
+ if (offsetToAlignment(Offset.getImm(), Align(2)))
return Error(IDLoc, "branch to misaligned address");
Inst.clear();
Inst.setOpcode(Mips::BEQ_MM);
@@ -3581,7 +3637,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
assert(DstRegOp.isReg() && "expected register operand kind");
const MCOperand &BaseRegOp = Inst.getOperand(1);
assert(BaseRegOp.isReg() && "expected register operand kind");
- const MCOperand &OffsetOp = Inst.getOperand(2);
MipsTargetStreamer &TOut = getTargetStreamer();
unsigned DstReg = DstRegOp.getReg();
@@ -3603,6 +3658,26 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return;
}
+ if (Inst.getNumOperands() > 3) {
+ const MCOperand &BaseRegOp = Inst.getOperand(2);
+ assert(BaseRegOp.isReg() && "expected register operand kind");
+ const MCOperand &ExprOp = Inst.getOperand(3);
+ assert(ExprOp.isExpr() && "expected expression operand kind");
+
+ unsigned BaseReg = BaseRegOp.getReg();
+ const MCExpr *ExprOffset = ExprOp.getExpr();
+
+ MCOperand LoOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
+ MCOperand HiOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+ TOut.emitSCWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
+ LoOperand, TmpReg, IDLoc, STI);
+ return;
+ }
+
+ const MCOperand &OffsetOp = Inst.getOperand(2);
+
if (OffsetOp.isImm()) {
int64_t LoOffset = OffsetOp.getImm() & 0xffff;
int64_t HiOffset = OffsetOp.getImm() & ~0xffff;
@@ -3625,21 +3700,54 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg,
BaseReg, IDLoc, STI);
TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, LoOffset, IDLoc, STI);
- } else {
- assert(OffsetOp.isExpr() && "expected expression operand kind");
- const MCExpr *ExprOffset = OffsetOp.getExpr();
- MCOperand LoOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
- MCOperand HiOperand = MCOperand::createExpr(
- MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+ return;
+ }
+
+ if (OffsetOp.isExpr()) {
+ if (inPicMode()) {
+ // FIXME:
+ // c) Check that immediates of R_MIPS_GOT16/R_MIPS_LO16 relocations
+ // do not exceed 16-bit.
+ // d) Use R_MIPS_GOT_PAGE/R_MIPS_GOT_OFST relocations instead
+ // of R_MIPS_GOT_DISP in appropriate cases to reduce number
+ // of GOT entries.
+ MCValue Res;
+ if (!OffsetOp.getExpr()->evaluateAsRelocatable(Res, nullptr, nullptr)) {
+ Error(IDLoc, "expected relocatable expression");
+ return;
+ }
+ if (Res.getSymB() != nullptr) {
+ Error(IDLoc, "expected relocatable expression with only one symbol");
+ return;
+ }
- if (IsLoad)
- TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
- LoOperand, TmpReg, IDLoc, STI);
- else
- TOut.emitStoreWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
- LoOperand, TmpReg, IDLoc, STI);
+ loadAndAddSymbolAddress(Res.getSymA(), TmpReg, BaseReg,
+ !ABI.ArePtrs64bit(), IDLoc, Out, STI);
+ TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, Res.getConstant(), IDLoc,
+ STI);
+ } else {
+ // FIXME: Implement 64-bit case.
+ // 1) lw $8, sym => lui $8, %hi(sym)
+ // lw $8, %lo(sym)($8)
+ // 2) sw $8, sym => lui $at, %hi(sym)
+ // sw $8, %lo(sym)($at)
+ const MCExpr *ExprOffset = OffsetOp.getExpr();
+ MCOperand LoOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
+ MCOperand HiOperand = MCOperand::createExpr(
+ MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+
+ // Generate the base address in TmpReg.
+ TOut.emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI);
+ if (BaseReg != Mips::ZERO)
+ TOut.emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
+ // Emit the load or store with the adjusted base and offset.
+ TOut.emitRRX(Inst.getOpcode(), DstReg, TmpReg, LoOperand, IDLoc, STI);
+ }
+ return;
}
+
+ llvm_unreachable("unexpected operand type");
}
bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
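For the non-PIC lui/addu + %lo expansion above, the %hi part must absorb the carry from the sign-extended 16-bit %lo so that the pair recomposes the original 32-bit address. A small standalone check of that arithmetic (sketch, independent of the relocation machinery):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// MIPS-style %hi/%lo split: %lo is a signed 16-bit immediate, so
// (hi << 16) + (int16_t)lo must equal the original address.
static uint32_t hiPart(uint32_t Addr) { return (Addr + 0x8000) >> 16; }
static int32_t loPart(uint32_t Addr) { return (int16_t)(Addr & 0xffff); }

int main() {
  for (uint32_t Addr : {0x12340000u, 0x1234fff0u, 0x00008000u}) {
    uint32_t Rebuilt = (hiPart(Addr) << 16) + (uint32_t)loPart(Addr);
    assert(Rebuilt == Addr && "lui %hi plus signed %lo must recompose");
  }
  return 0;
}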
@@ -6976,7 +7084,7 @@ bool MipsAsmParser::parseSetPushDirective() {
// Create a copy of the current assembler options environment and push it.
AssemblerOptions.push_back(
- llvm::make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
+ std::make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
getTargetStreamer().emitDirectiveSetPush();
return false;
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index ef13507fe63a..c3e98fe410c1 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -267,6 +267,13 @@ static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+// DecodeJumpTargetXMM - Decode microMIPS jump and link exchange target,
+// which is shifted left by 2 bits.
+static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeMem(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -2291,6 +2298,15 @@ static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeJumpTargetXMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2;
+ Inst.addOperand(MCOperand::createImm(JumpOffset));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst,
unsigned Value,
uint64_t Address,
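A quick standalone check of the new decoder above (sketch): the 26-bit field is shifted left by two, so an encoded field of 0x100 becomes a byte offset of 0x400.

#include <cassert>
#include <cstdint>

// Mirrors fieldFromInstruction(Insn, 0, 26) << 2 from DecodeJumpTargetXMM.
static uint32_t decodeJumpTargetXMM(uint32_t Insn) {
  uint32_t Field26 = Insn & ((1u << 26) - 1); // low 26 bits of the encoding
  return Field26 << 2;
}

int main() {
  assert(decodeJumpTargetXMM(0x00000100u) == 0x00000400u);
  assert(decodeJumpTargetXMM(0x03ffffffu) == 0x0ffffffcu); // maximum field value
  return 0;
}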
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 859f9cbbca07..70f2a7bdf10f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -304,7 +304,6 @@ Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const {
return StringSwitch<Optional<MCFixupKind>>(Name)
.Case("R_MIPS_NONE", FK_NONE)
.Case("R_MIPS_32", FK_Data_4)
- .Case("R_MIPS_GOT_PAGE", (MCFixupKind)Mips::fixup_Mips_GOT_PAGE)
.Case("R_MIPS_CALL_HI16", (MCFixupKind)Mips::fixup_Mips_CALL_HI16)
.Case("R_MIPS_CALL_LO16", (MCFixupKind)Mips::fixup_Mips_CALL_LO16)
.Case("R_MIPS_CALL16", (MCFixupKind)Mips::fixup_Mips_CALL16)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 4d7e36995ae4..cca75dfc45c2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -66,9 +66,9 @@ public:
/// fixupNeedsRelaxation - Target specific predicate for whether a given
/// fixup requires the associated instruction to be relaxed.
- bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
- const MCRelaxableFragment *DF,
- const MCAsmLayout &Layout) const override {
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
// FIXME.
llvm_unreachable("RelaxInstruction() unimplemented");
return false;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index cf7bae98a27f..cc3168790b98 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -219,7 +219,7 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
const MCFixup &Fixup,
bool IsPCRel) const {
// Determine the type of the relocation.
- unsigned Kind = (unsigned)Fixup.getKind();
+ unsigned Kind = Fixup.getTargetKind();
switch (Kind) {
case FK_NONE:
@@ -690,6 +690,6 @@ llvm::createMipsELFObjectWriter(const Triple &TT, bool IsN32) {
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
bool IsN64 = TT.isArch64Bit() && !IsN32;
bool HasRelocationAddend = TT.isArch64Bit();
- return llvm::make_unique<MipsELFObjectWriter>(OSABI, HasRelocationAddend,
+ return std::make_unique<MipsELFObjectWriter>(OSABI, HasRelocationAddend,
IsN64);
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 759a7fdb32b8..142e9cebb79e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -485,8 +485,11 @@ getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
assert(MO.isExpr() &&
"getJumpOffset16OpValue expects only expressions or an immediate");
- // TODO: Push fixup.
- return 0;
+ const MCExpr *Expr = MO.getExpr();
+ Mips::Fixups FixupKind =
+ isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16 : Mips::fixup_Mips_LO16;
+ Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind)));
+ return 0;
}
/// getJumpTargetOpValue - Return binary encoding of the jump
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
index ad5aff6552f6..a84ca8ccfb2d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -10,11 +10,12 @@
#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H
#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/Support/Alignment.h"
namespace llvm {
-// Log2 of the NaCl MIPS sandbox's instruction bundle size.
-static const unsigned MIPS_NACL_BUNDLE_ALIGN = 4u;
+// NaCl MIPS sandbox's instruction bundle size.
+static const Align MIPS_NACL_BUNDLE_ALIGN = Align(16);
bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
bool *IsStore = nullptr);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index ddeec03ba784..79c47d1b6508 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -143,12 +143,15 @@ public:
return false;
switch (Info->get(Inst.getOpcode()).OpInfo[NumOps - 1].OperandType) {
case MCOI::OPERAND_UNKNOWN:
- case MCOI::OPERAND_IMMEDIATE:
- // jal, bal ...
- Target = Inst.getOperand(NumOps - 1).getImm();
+ case MCOI::OPERAND_IMMEDIATE: {
+ // j, jal, jalx, jals
+ // Absolute branch within the current 256 MB-aligned region
+ uint64_t Region = Addr & ~uint64_t(0xfffffff);
+ Target = Region + Inst.getOperand(NumOps - 1).getImm();
return true;
+ }
case MCOI::OPERAND_PCREL:
- // b, j, beq ...
+ // b, beq ...
Target = Addr + Inst.getOperand(NumOps - 1).getImm();
return true;
default:
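
The OPERAND_IMMEDIATE change above turns j/jal/jalx operands into absolute targets within the current 256 MB region rather than PC-relative offsets. Expressed as a stand-alone computation (illustrative; absoluteJumpTarget is a hypothetical name):

#include <cassert>
#include <cstdint>

// Sketch only: the target computation the hunk above introduces for absolute
// jumps. The operand is the already-shifted index from the decoder.
static uint64_t absoluteJumpTarget(uint64_t Addr, uint64_t ShiftedIndex) {
  uint64_t Region = Addr & ~uint64_t(0xfffffff); // current 256 MB region
  return Region + ShiftedIndex;
}

int main() {
  assert(absoluteJumpTarget(0x40001000, 0x20) == 0x40000020);
  assert(absoluteJumpTarget(0x4FFFFFFC, 0x20) == 0x40000020); // same region
  return 0;
}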
diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index c050db8a17fd..2d53750ad0ee 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -270,7 +270,7 @@ MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context,
S->getAssembler().setRelaxAll(true);
// Set bundle-alignment as required by the NaCl ABI for the target.
- S->EmitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN);
+ S->EmitBundleAlignMode(Log2(MIPS_NACL_BUNDLE_ALIGN));
return S;
}
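
Since EmitBundleAlignMode still expects a log2 exponent, the Align(16) constant from MipsMCNaCl.h is passed through Log2() here, so the emitted bundle alignment is unchanged. A small self-check of that equivalence (log2OfAlign is a minimal stand-in for llvm::Log2(Align)):

#include <cassert>
#include <cstdint>

// Sketch only: the old constant was the exponent (4), the new one is the byte
// value (Align(16)); Log2() recovers the exponent at the call site.
static unsigned log2OfAlign(uint64_t AlignBytes) {
  unsigned Exp = 0;
  while ((uint64_t(1) << Exp) < AlignBytes)
    ++Exp;
  return Exp;
}

int main() {
  assert(log2OfAlign(16) == 4); // same bundle-align mode as before the change
  return 0;
}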
diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
index b4ebb9d18b72..3ff9c722484b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -37,7 +37,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() {
Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS,
ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1, "");
MCA.registerSection(*Sec);
- Sec->setAlignment(8);
+ Sec->setAlignment(Align(8));
Streamer->SwitchSection(Sec);
Streamer->EmitIntValue(ELF::ODK_REGINFO, 1); // kind
@@ -55,7 +55,7 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() {
MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO,
ELF::SHF_ALLOC, 24, "");
MCA.registerSection(*Sec);
- Sec->setAlignment(MTS->getABI().IsN32() ? 8 : 4);
+ Sec->setAlignment(MTS->getABI().IsN32() ? Align(8) : Align(4));
Streamer->SwitchSection(Sec);
Streamer->EmitIntValue(ri_gprmask, 4);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index e3bdb3b140a8..b6dae9f6dea8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -34,6 +34,15 @@ static cl::opt<bool> RoundSectionSizes(
cl::desc("Round section sizes up to the section alignment"), cl::Hidden);
} // end anonymous namespace
+static bool isMipsR6(const MCSubtargetInfo *STI) {
+ return STI->getFeatureBits()[Mips::FeatureMips32r6] ||
+ STI->getFeatureBits()[Mips::FeatureMips64r6];
+}
+
+static bool isMicroMips(const MCSubtargetInfo *STI) {
+ return STI->getFeatureBits()[Mips::FeatureMicroMips];
+}
+
MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S), GPReg(Mips::GP), ModuleDirectiveAllowed(true) {
GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
@@ -216,6 +225,19 @@ void MipsTargetStreamer::emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc, STI);
}
+void MipsTargetStreamer::emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+ unsigned Reg2, MCOperand Op3, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(MCOperand::createReg(Reg0));
+ TmpInst.addOperand(MCOperand::createReg(Reg1));
+ TmpInst.addOperand(MCOperand::createReg(Reg2));
+ TmpInst.addOperand(Op3);
+ TmpInst.setLoc(IDLoc);
+ getStreamer().EmitInstruction(TmpInst, *STI);
+}
+
void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1,
int16_t Imm, SMLoc IDLoc,
const MCSubtargetInfo *STI) {
@@ -264,8 +286,7 @@ void MipsTargetStreamer::emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc,
}
void MipsTargetStreamer::emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI) {
- const FeatureBitset &Features = STI->getFeatureBits();
- if (Features[Mips::FeatureMicroMips])
+ if (isMicroMips(STI))
emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, STI);
else
emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
@@ -311,21 +332,34 @@ void MipsTargetStreamer::emitStoreWithImmOffset(
emitRRI(Opcode, SrcReg, ATReg, LoOffset, IDLoc, STI);
}
-/// Emit a store instruction with an symbol offset. Symbols are assumed to be
-/// out of range for a simm16 will be expanded to appropriate instructions.
-void MipsTargetStreamer::emitStoreWithSymOffset(
- unsigned Opcode, unsigned SrcReg, unsigned BaseReg, MCOperand &HiOperand,
- MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc,
- const MCSubtargetInfo *STI) {
- // sw $8, sym => lui $at, %hi(sym)
- // sw $8, %lo(sym)($at)
+/// Emit a store-conditional instruction with a symbol offset.
+void MipsTargetStreamer::emitSCWithSymOffset(unsigned Opcode, unsigned SrcReg,
+ unsigned BaseReg,
+ MCOperand &HiOperand,
+ MCOperand &LoOperand,
+ unsigned ATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI) {
+ // sc $8, sym => lui $at, %hi(sym)
+ // sc $8, %lo(sym)($at)
// Generate the base address in ATReg.
emitRX(Mips::LUi, ATReg, HiOperand, IDLoc, STI);
- if (BaseReg != Mips::ZERO)
- emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI);
- // Emit the store with the adjusted base and offset.
- emitRRX(Opcode, SrcReg, ATReg, LoOperand, IDLoc, STI);
+ if (!isMicroMips(STI) && isMipsR6(STI)) {
+ // For non-microMIPS R6, the offset for 'sc' is not in the lower 16 bits,
+ // so we put it in 'at'.
+ // sc $8, sym => lui $at, %hi(sym)
+ // addiu $at, $at, %lo(sym)
+ // sc $8, 0($at)
+ emitRRX(Mips::ADDiu, ATReg, ATReg, LoOperand, IDLoc, STI);
+ MCOperand Offset = MCOperand::createImm(0);
+ // Emit the store with the adjusted base and offset.
+ emitRRRX(Opcode, SrcReg, SrcReg, ATReg, Offset, IDLoc, STI);
+ } else {
+ if (BaseReg != Mips::ZERO)
+ emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI);
+ // Emit the store with the adjusted base and offset.
+ emitRRRX(Opcode, SrcReg, SrcReg, ATReg, LoOperand, IDLoc, STI);
+ }
}
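
The R6 branch of emitSCWithSymOffset exists because the R6 'sc' encoding carries a much smaller offset field than the classic 16-bit one, so %lo(sym) has to be folded into $at with addiu before the store. A rough sketch of the range check behind that choice (the helper name is hypothetical, and the 9-bit figure is the architectural offset width, not something stated in the patch):

#include <cassert>
#include <cstdint>

// Sketch only: why a full %lo() value fits pre-R6 but not on MIPS32r6.
static bool fitsInOffsetField(int32_t Offset, bool IsR6) {
  int Bits = IsR6 ? 9 : 16; // assumed R6 'sc' offset width
  int32_t Min = -(1 << (Bits - 1)), Max = (1 << (Bits - 1)) - 1;
  return Offset >= Min && Offset <= Max;
}

int main() {
  int32_t Lo = -21555;                  // a typical signed %lo() value
  assert(fitsInOffsetField(Lo, false)); // pre-R6: encode directly in sc
  assert(!fitsInOffsetField(Lo, true)); // R6: addiu $at, $at, %lo(sym) first
  return 0;
}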
/// Emit a load instruction with an immediate offset. DstReg and TmpReg are
@@ -364,30 +398,6 @@ void MipsTargetStreamer::emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg,
emitRRI(Opcode, DstReg, TmpReg, LoOffset, IDLoc, STI);
}
-/// Emit a load instruction with an symbol offset. Symbols are assumed to be
-/// out of range for a simm16 will be expanded to appropriate instructions.
-/// DstReg and TmpReg are permitted to be the same register iff DstReg is a
-/// GPR. It is the callers responsibility to identify such cases and pass the
-/// appropriate register in TmpReg.
-void MipsTargetStreamer::emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg,
- unsigned BaseReg,
- MCOperand &HiOperand,
- MCOperand &LoOperand,
- unsigned TmpReg, SMLoc IDLoc,
- const MCSubtargetInfo *STI) {
- // 1) lw $8, sym => lui $8, %hi(sym)
- // lw $8, %lo(sym)($8)
- // 2) ldc1 $f0, sym => lui $at, %hi(sym)
- // ldc1 $f0, %lo(sym)($at)
-
- // Generate the base address in TmpReg.
- emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI);
- if (BaseReg != Mips::ZERO)
- emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
- // Emit the load with the adjusted base and offset.
- emitRRX(Opcode, DstReg, TmpReg, LoOperand, IDLoc, STI);
-}
-
MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
: MipsTargetStreamer(S), OS(OS) {}
@@ -891,9 +901,9 @@ void MipsTargetELFStreamer::finish() {
MCSection &BSSSection = *OFI.getBSSSection();
MCA.registerSection(BSSSection);
- TextSection.setAlignment(std::max(16u, TextSection.getAlignment()));
- DataSection.setAlignment(std::max(16u, DataSection.getAlignment()));
- BSSSection.setAlignment(std::max(16u, BSSSection.getAlignment()));
+ TextSection.setAlignment(Align(std::max(16u, TextSection.getAlignment())));
+ DataSection.setAlignment(Align(std::max(16u, DataSection.getAlignment())));
+ BSSSection.setAlignment(Align(std::max(16u, BSSSection.getAlignment())));
if (RoundSectionSizes) {
// Make sections sizes a multiple of the alignment. This is useful for
@@ -1016,7 +1026,7 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context);
MCA.registerSection(*Sec);
- Sec->setAlignment(4);
+ Sec->setAlignment(Align(4));
OS.PushSection();
@@ -1306,7 +1316,7 @@ void MipsTargetELFStreamer::emitMipsAbiFlags() {
MCSectionELF *Sec = Context.getELFSection(
".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24, "");
MCA.registerSection(*Sec);
- Sec->setAlignment(8);
+ Sec->setAlignment(Align(8));
OS.SwitchSection(Sec);
OS << ABIFlagsSection;
diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
index 5a12568893af..9a1e47e5ecca 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -360,7 +360,7 @@ class RDDSP_MM_DESC {
dag OutOperandList = (outs GPR32Opnd:$rt);
dag InOperandList = (ins uimm7:$mask);
string AsmString = !strconcat("rddsp", "\t$rt, $mask");
- list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt7:$mask))];
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp timmZExt7:$mask))];
InstrItinClass Itinerary = NoItinerary;
}
@@ -383,7 +383,7 @@ class WRDSP_MM_DESC {
dag OutOperandList = (outs);
dag InOperandList = (ins GPR32Opnd:$rt, uimm7:$mask);
string AsmString = !strconcat("wrdsp", "\t$rt, $mask");
- list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)];
+ list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, timmZExt7:$mask)];
InstrItinClass Itinerary = NoItinerary;
bit isMoveReg = 1;
}
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 9b7f7b25fa94..8cc0029fc896 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -955,17 +955,18 @@ let DecoderNamespace = "MicroMips" in {
EXT_FM_MM<0x0c>, ISA_MICROMIPS32_NOT_MIPS32R6;
/// Jump Instructions
- let DecoderMethod = "DecodeJumpTargetMM" in
+ let DecoderMethod = "DecodeJumpTargetMM" in {
def J_MM : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
J_FM_MM<0x35>, AdditionalRequires<[RelocNotPIC]>,
IsBranch, ISA_MICROMIPS32_NOT_MIPS32R6;
-
- let DecoderMethod = "DecodeJumpTargetMM" in {
def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>,
ISA_MICROMIPS32_NOT_MIPS32R6;
+ }
+
+ let DecoderMethod = "DecodeJumpTargetXMM" in
def JALX_MM : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>,
ISA_MICROMIPS32_NOT_MIPS32R6;
- }
+
def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>,
ISA_MICROMIPS32_NOT_MIPS32R6;
def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>,
diff --git a/lib/Target/Mips/MicroMipsSizeReduction.cpp b/lib/Target/Mips/MicroMipsSizeReduction.cpp
index 70af95592aa5..db93b3d80ede 100644
--- a/lib/Target/Mips/MicroMipsSizeReduction.cpp
+++ b/lib/Target/Mips/MicroMipsSizeReduction.cpp
@@ -361,7 +361,7 @@ static bool CheckXWPInstr(MachineInstr *MI, bool ReduceToLwp,
MI->getOpcode() == Mips::SW16_MM))
return false;
- unsigned reg = MI->getOperand(0).getReg();
+ Register reg = MI->getOperand(0).getReg();
if (reg == Mips::RA)
return false;
@@ -403,8 +403,8 @@ static bool ConsecutiveInstr(MachineInstr *MI1, MachineInstr *MI2) {
if (!GetImm(MI2, 2, Offset2))
return false;
- unsigned Reg1 = MI1->getOperand(0).getReg();
- unsigned Reg2 = MI2->getOperand(0).getReg();
+ Register Reg1 = MI1->getOperand(0).getReg();
+ Register Reg2 = MI2->getOperand(0).getReg();
return ((Offset1 == (Offset2 - 4)) && (ConsecutiveRegisters(Reg1, Reg2)));
}
@@ -475,8 +475,8 @@ bool MicroMipsSizeReduce::ReduceXWtoXWP(ReduceEntryFunArgs *Arguments) {
if (!CheckXWPInstr(MI2, ReduceToLwp, Entry))
return false;
- unsigned Reg1 = MI1->getOperand(1).getReg();
- unsigned Reg2 = MI2->getOperand(1).getReg();
+ Register Reg1 = MI1->getOperand(1).getReg();
+ Register Reg2 = MI2->getOperand(1).getReg();
if (Reg1 != Reg2)
return false;
@@ -621,8 +621,8 @@ bool MicroMipsSizeReduce::ReduceMoveToMovep(ReduceEntryFunArgs *Arguments) {
MachineInstr *MI1 = Arguments->MI;
MachineInstr *MI2 = &*NextMII;
- unsigned RegDstMI1 = MI1->getOperand(0).getReg();
- unsigned RegSrcMI1 = MI1->getOperand(1).getReg();
+ Register RegDstMI1 = MI1->getOperand(0).getReg();
+ Register RegSrcMI1 = MI1->getOperand(1).getReg();
if (!IsMovepSrcRegister(RegSrcMI1))
return false;
@@ -633,8 +633,8 @@ bool MicroMipsSizeReduce::ReduceMoveToMovep(ReduceEntryFunArgs *Arguments) {
if (MI2->getOpcode() != Entry.WideOpc())
return false;
- unsigned RegDstMI2 = MI2->getOperand(0).getReg();
- unsigned RegSrcMI2 = MI2->getOperand(1).getReg();
+ Register RegDstMI2 = MI2->getOperand(0).getReg();
+ Register RegSrcMI2 = MI2->getOperand(1).getReg();
if (!IsMovepSrcRegister(RegSrcMI2))
return false;
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 7b83ea8535ae..a5908362e81f 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -25,6 +25,8 @@ class PredicateControl {
list<Predicate> GPRPredicates = [];
// Predicates for the PTR size such as IsPTR64bit
list<Predicate> PTRPredicates = [];
+ // Predicates for a symbol's size such as hasSym32.
+ list<Predicate> SYMPredicates = [];
// Predicates for the FGR size and layout such as IsFP64bit
list<Predicate> FGRPredicates = [];
// Predicates for the instruction group membership such as ISA's.
@@ -38,6 +40,7 @@ class PredicateControl {
list<Predicate> Predicates = !listconcat(EncodingPredicates,
GPRPredicates,
PTRPredicates,
+ SYMPredicates,
FGRPredicates,
InsnPredicates,
HardFloatPredicate,
@@ -206,6 +209,9 @@ def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">;
def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true",
"Disable use of the jal instruction">;
+def FeatureXGOT
+ : SubtargetFeature<"xgot", "UseXGOT", "true", "Assume 32-bit GOT">;
+
def FeatureUseIndirectJumpsHazard : SubtargetFeature<"use-indirect-jump-hazard",
"UseIndirectJumpsHazard",
"true", "Use indirect jump"
@@ -257,3 +263,9 @@ def Mips : Target {
let AssemblyParserVariants = [MipsAsmParserVariant];
let AllowRegisterRenaming = 1;
}
+
+//===----------------------------------------------------------------------===//
+// Pfm Counters
+//===----------------------------------------------------------------------===//
+
+include "MipsPfmCounters.td"
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 3ab4f1e064da..768d54fc9c24 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -72,7 +72,7 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
MachineRegisterInfo &RegInfo = MF.getRegInfo();
const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
DebugLoc DL;
- unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+ Register V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
V0 = RegInfo.createVirtualRegister(RC);
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 6d8e5aef2a3f..5a5b78c9d5f9 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -708,8 +708,8 @@ Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- unsigned regX = MI.getOperand(0).getReg();
- unsigned regY = MI.getOperand(1).getReg();
+ Register regX = MI.getOperand(0).getReg();
+ Register regY = MI.getOperand(1).getReg();
MachineBasicBlock *target = MI.getOperand(2).getMBB();
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(CmpOpc))
.addReg(regX)
@@ -725,7 +725,7 @@ MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- unsigned regX = MI.getOperand(0).getReg();
+ Register regX = MI.getOperand(0).getReg();
int64_t imm = MI.getOperand(1).getImm();
MachineBasicBlock *target = MI.getOperand(2).getMBB();
unsigned CmpOpc;
@@ -758,9 +758,9 @@ Mips16TargetLowering::emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr &MI,
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- unsigned CC = MI.getOperand(0).getReg();
- unsigned regX = MI.getOperand(1).getReg();
- unsigned regY = MI.getOperand(2).getReg();
+ Register CC = MI.getOperand(0).getReg();
+ Register regX = MI.getOperand(1).getReg();
+ Register regY = MI.getOperand(2).getReg();
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc))
.addReg(regX)
.addReg(regY);
@@ -777,8 +777,8 @@ Mips16TargetLowering::emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc,
if (DontExpandCondPseudos16)
return BB;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- unsigned CC = MI.getOperand(0).getReg();
- unsigned regX = MI.getOperand(1).getReg();
+ Register CC = MI.getOperand(0).getReg();
+ Register regX = MI.getOperand(1).getReg();
int64_t Imm = MI.getOperand(2).getImm();
unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm);
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc)).addReg(regX).addImm(Imm);
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index c234c309d760..0d735c20ec2f 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -358,7 +358,7 @@ unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm,
for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
MachineOperand &MO = II->getOperand(i);
if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ !Register::isVirtualRegister(MO.getReg()))
Candidates.reset(MO.getReg());
}
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 7f35280f7936..cc15949b0d57 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -16,6 +16,7 @@
// shamt must fit in 6 bits.
def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
+def timmZExt6 : TImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
// Node immediate fits as 10-bit sign extended on target immediate.
// e.g. seqi, snei
@@ -651,6 +652,7 @@ def : MipsPat<(MipsTlsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>,
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(MipsJmpLink (i64 texternalsym:$dst)),
(JAL texternalsym:$dst)>, ISA_MIPS3, GPR_64, SYM_64;
+
def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)),
(LUi64 tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(MipsHighest (i64 tblockaddress:$in)),
@@ -682,6 +684,20 @@ let AdditionalPredicates = [NotInMicroMips] in {
(DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tconstpool:$lo))),
(DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 texternalsym:$lo))),
+ (DADDiu GPR64:$hi, texternalsym:$lo)>,
+ ISA_MIPS3, GPR_64, SYM_64;
+
+ def : MipsPat<(MipsHi (i64 tglobaladdr:$in)),
+ (DADDiu ZERO_64, tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(MipsHi (i64 tblockaddress:$in)),
+ (DADDiu ZERO_64, tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(MipsHi (i64 tjumptable:$in)),
+ (DADDiu ZERO_64, tjumptable:$in)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(MipsHi (i64 tconstpool:$in)),
+ (DADDiu ZERO_64, tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(MipsHi (i64 texternalsym:$in)),
+ (DADDiu ZERO_64, texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))),
(DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
@@ -692,6 +708,23 @@ let AdditionalPredicates = [NotInMicroMips] in {
(DADDiu GPR64:$hi, tjumptable:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tconstpool:$lo))),
(DADDiu GPR64:$hi, tconstpool:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 texternalsym:$lo))),
+ (DADDiu GPR64:$hi, texternalsym:$lo)>,
+ ISA_MIPS3, GPR_64, SYM_64;
+
+ def : MipsPat<(MipsLo (i64 tglobaladdr:$in)),
+ (DADDiu ZERO_64, tglobaladdr:$in)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(MipsLo (i64 tblockaddress:$in)),
+ (DADDiu ZERO_64, tblockaddress:$in)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(MipsLo (i64 tjumptable:$in)),
+ (DADDiu ZERO_64, tjumptable:$in)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(MipsLo (i64 tconstpool:$in)),
+ (DADDiu ZERO_64, tconstpool:$in)>, ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(MipsLo (i64 tglobaltlsaddr:$in)),
+ (DADDiu ZERO_64, tglobaltlsaddr:$in)>,
+ ISA_MIPS3, GPR_64, SYM_64;
+ def : MipsPat<(MipsLo (i64 texternalsym:$in)),
+ (DADDiu ZERO_64, texternalsym:$in)>, ISA_MIPS3, GPR_64, SYM_64;
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))),
(DADDiu GPR64:$hi, tglobaladdr:$lo)>, ISA_MIPS3, GPR_64, SYM_64;
@@ -705,6 +738,9 @@ let AdditionalPredicates = [NotInMicroMips] in {
def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaltlsaddr:$lo))),
(DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MIPS3, GPR_64,
SYM_64;
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 texternalsym:$lo))),
+ (DADDiu GPR64:$hi, texternalsym:$lo)>,
+ ISA_MIPS3, GPR_64, SYM_64;
}
// gp_rel relocs
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index db83fe49cec0..2201545adc94 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -56,6 +56,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
@@ -376,7 +377,7 @@ void MipsAsmPrinter::printSavedRegsBitmask() {
void MipsAsmPrinter::emitFrameDirective() {
const TargetRegisterInfo &RI = *MF->getSubtarget().getRegisterInfo();
- unsigned stackReg = RI.getFrameRegister(*MF);
+ Register stackReg = RI.getFrameRegister(*MF);
unsigned returnReg = RI.getRARegister();
unsigned stackSize = MF->getFrameInfo().getStackSize();
@@ -571,7 +572,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
// for 2 for 32 bit mode and 1 for 64 bit mode.
if (NumVals != 2) {
if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
O << '$' << MipsInstPrinter::getRegisterName(Reg);
return false;
}
@@ -597,7 +598,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
const MachineOperand &MO = MI->getOperand(RegOp);
if (!MO.isReg())
return true;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
O << '$' << MipsInstPrinter::getRegisterName(Reg);
return false;
}
@@ -780,7 +781,7 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU());
StringRef FS = TM.getTargetFeatureString();
const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM);
- const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, 0);
+ const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, None);
bool IsABICalls = STI.isABICalls();
const MipsABIInfo &ABI = MTM.getABI();
@@ -821,6 +822,9 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
// option has changed the default (i.e. FPXX) and omit it otherwise.
if (ABI.IsO32() && (!STI.useOddSPReg() || STI.isABI_FPXX()))
TS.emitDirectiveModuleOddSPReg();
+
+ // Switch to the .text section.
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
}
void MipsAsmPrinter::emitInlineAsmStart() const {
diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp
index da65689ecff5..cad82953af50 100644
--- a/lib/Target/Mips/MipsCallLowering.cpp
+++ b/lib/Target/Mips/MipsCallLowering.cpp
@@ -106,6 +106,7 @@ private:
Register ArgsReg, const EVT &VT) override;
virtual void markPhysRegUsed(unsigned PhysReg) {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
@@ -357,7 +358,7 @@ bool OutgoingValueHandler::handleSplit(SmallVectorImpl<Register> &VRegs,
return true;
}
-static bool isSupportedType(Type *T) {
+static bool isSupportedArgumentType(Type *T) {
if (T->isIntegerTy())
return true;
if (T->isPointerTy())
@@ -367,6 +368,18 @@ static bool isSupportedType(Type *T) {
return false;
}
+static bool isSupportedReturnType(Type *T) {
+ if (T->isIntegerTy())
+ return true;
+ if (T->isPointerTy())
+ return true;
+ if (T->isFloatingPointTy())
+ return true;
+ if (T->isAggregateType())
+ return true;
+ return false;
+}
+
static CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT,
const ISD::ArgFlagsTy &Flags) {
// > does not mean loss of information as type RegisterVT can't hold type VT,
@@ -403,7 +416,7 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(Mips::RetRA);
- if (Val != nullptr && !isSupportedType(Val->getType()))
+ if (Val != nullptr && !isSupportedReturnType(Val->getType()))
return false;
if (!VRegs.empty()) {
@@ -411,21 +424,13 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Function &F = MF.getFunction();
const DataLayout &DL = MF.getDataLayout();
const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
- LLVMContext &Ctx = Val->getType()->getContext();
-
- SmallVector<EVT, 4> SplitEVTs;
- ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
- assert(VRegs.size() == SplitEVTs.size() &&
- "For each split Type there should be exactly one VReg.");
SmallVector<ArgInfo, 8> RetInfos;
SmallVector<unsigned, 8> OrigArgIndices;
- for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
- ArgInfo CurArgInfo = ArgInfo{VRegs[i], SplitEVTs[i].getTypeForEVT(Ctx)};
- setArgFlags(CurArgInfo, AttributeList::ReturnIndex, DL, F);
- splitToValueTypes(CurArgInfo, 0, RetInfos, OrigArgIndices);
- }
+ ArgInfo ArgRetInfo(VRegs, Val->getType());
+ setArgFlags(ArgRetInfo, AttributeList::ReturnIndex, DL, F);
+ splitToValueTypes(DL, ArgRetInfo, 0, RetInfos, OrigArgIndices);
SmallVector<ISD::OutputArg, 8> Outs;
subTargetRegTypeForCallingConv(F, RetInfos, OrigArgIndices, Outs);
@@ -453,12 +458,8 @@ bool MipsCallLowering::lowerFormalArguments(
if (F.arg_empty())
return true;
- if (F.isVarArg()) {
- return false;
- }
-
for (auto &Arg : F.args()) {
- if (!isSupportedType(Arg.getType()))
+ if (!isSupportedArgumentType(Arg.getType()))
return false;
}
@@ -472,7 +473,8 @@ bool MipsCallLowering::lowerFormalArguments(
for (auto &Arg : F.args()) {
ArgInfo AInfo(VRegs[i], Arg.getType());
setArgFlags(AInfo, i + AttributeList::FirstArgIndex, DL, F);
- splitToValueTypes(AInfo, i, ArgInfos, OrigArgIndices);
+ ArgInfos.push_back(AInfo);
+ OrigArgIndices.push_back(i);
++i;
}
@@ -495,30 +497,64 @@ bool MipsCallLowering::lowerFormalArguments(
if (!Handler.handle(ArgLocs, ArgInfos))
return false;
+ if (F.isVarArg()) {
+ ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs();
+ unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs);
+
+ int VaArgOffset;
+ unsigned RegSize = 4;
+ if (ArgRegs.size() == Idx)
+ VaArgOffset = alignTo(CCInfo.getNextStackOffset(), RegSize);
+ else {
+ VaArgOffset =
+ (int)ABI.GetCalleeAllocdArgSizeInBytes(CCInfo.getCallingConv()) -
+ (int)(RegSize * (ArgRegs.size() - Idx));
+ }
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ int FI = MFI.CreateFixedObject(RegSize, VaArgOffset, true);
+ MF.getInfo<MipsFunctionInfo>()->setVarArgsFrameIndex(FI);
+
+ for (unsigned I = Idx; I < ArgRegs.size(); ++I, VaArgOffset += RegSize) {
+ MIRBuilder.getMBB().addLiveIn(ArgRegs[I]);
+
+ MachineInstrBuilder Copy =
+ MIRBuilder.buildCopy(LLT::scalar(RegSize * 8), Register(ArgRegs[I]));
+ FI = MFI.CreateFixedObject(RegSize, VaArgOffset, true);
+ MachinePointerInfo MPO = MachinePointerInfo::getFixedStack(MF, FI);
+ MachineInstrBuilder FrameIndex =
+ MIRBuilder.buildFrameIndex(LLT::pointer(MPO.getAddrSpace(), 32), FI);
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, RegSize,
+ /* Alignment */ RegSize);
+ MIRBuilder.buildStore(Copy, FrameIndex, *MMO);
+ }
+ }
+
return true;
}
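
The vararg block above computes where the unallocated argument registers get spilled. With O32 numbers plugged in (four 4-byte argument registers, a 16-byte callee-allocated area), the offset arithmetic reduces to the following (illustrative helper; assumptions are noted in the comments):

#include <cassert>

// Sketch only: varArgOffset mirrors the two cases in the block above.
static int varArgOffset(unsigned NumArgRegs, unsigned FirstUnallocated,
                        unsigned NextStackOffset, unsigned RegSize,
                        unsigned CalleeAllocdBytes) {
  if (FirstUnallocated == NumArgRegs)
    // Every argument register holds a named argument: varargs live on the
    // caller's stack, starting at the next aligned stack offset.
    return (NextStackOffset + RegSize - 1) / RegSize * RegSize;
  // Otherwise reuse the tail of the register save area for the registers
  // that were not allocated to named arguments.
  return (int)CalleeAllocdBytes - (int)(RegSize * (NumArgRegs - FirstUnallocated));
}

int main() {
  assert(varArgOffset(4, 1, 16, 4, 16) == 4);  // $a1-$a3 spilled at 4, 8, 12
  assert(varArgOffset(4, 4, 16, 4, 16) == 16); // all regs named: start at 16
  return 0;
}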
bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
- CallingConv::ID CallConv,
- const MachineOperand &Callee,
- const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const {
+ CallLoweringInfo &Info) const {
- if (CallConv != CallingConv::C)
+ if (Info.CallConv != CallingConv::C)
return false;
- for (auto &Arg : OrigArgs) {
- if (!isSupportedType(Arg.Ty))
+ for (auto &Arg : Info.OrigArgs) {
+ if (!isSupportedArgumentType(Arg.Ty))
+ return false;
+ if (Arg.Flags[0].isByVal())
return false;
- if (Arg.Flags.isByVal() || Arg.Flags.isSRet())
+ if (Arg.Flags[0].isSRet() && !Arg.Ty->isPointerTy())
return false;
}
- if (OrigRet.Regs[0] && !isSupportedType(OrigRet.Ty))
+ if (!Info.OrigRet.Ty->isVoidTy() && !isSupportedReturnType(Info.OrigRet.Ty))
return false;
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
+ const DataLayout &DL = MF.getDataLayout();
const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
const MipsTargetMachine &TM =
static_cast<const MipsTargetMachine &>(MF.getTarget());
@@ -528,37 +564,38 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MIRBuilder.buildInstr(Mips::ADJCALLSTACKDOWN);
const bool IsCalleeGlobalPIC =
- Callee.isGlobal() && TM.isPositionIndependent();
+ Info.Callee.isGlobal() && TM.isPositionIndependent();
MachineInstrBuilder MIB = MIRBuilder.buildInstrNoInsert(
- Callee.isReg() || IsCalleeGlobalPIC ? Mips::JALRPseudo : Mips::JAL);
+ Info.Callee.isReg() || IsCalleeGlobalPIC ? Mips::JALRPseudo : Mips::JAL);
MIB.addDef(Mips::SP, RegState::Implicit);
if (IsCalleeGlobalPIC) {
Register CalleeReg =
MF.getRegInfo().createGenericVirtualRegister(LLT::pointer(0, 32));
MachineInstr *CalleeGlobalValue =
- MIRBuilder.buildGlobalValue(CalleeReg, Callee.getGlobal());
- if (!Callee.getGlobal()->hasLocalLinkage())
+ MIRBuilder.buildGlobalValue(CalleeReg, Info.Callee.getGlobal());
+ if (!Info.Callee.getGlobal()->hasLocalLinkage())
CalleeGlobalValue->getOperand(1).setTargetFlags(MipsII::MO_GOT_CALL);
MIB.addUse(CalleeReg);
} else
- MIB.add(Callee);
+ MIB.add(Info.Callee);
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv()));
TargetLowering::ArgListTy FuncOrigArgs;
- FuncOrigArgs.reserve(OrigArgs.size());
+ FuncOrigArgs.reserve(Info.OrigArgs.size());
SmallVector<ArgInfo, 8> ArgInfos;
SmallVector<unsigned, 8> OrigArgIndices;
unsigned i = 0;
- for (auto &Arg : OrigArgs) {
+ for (auto &Arg : Info.OrigArgs) {
TargetLowering::ArgListEntry Entry;
Entry.Ty = Arg.Ty;
FuncOrigArgs.push_back(Entry);
- splitToValueTypes(Arg, i, ArgInfos, OrigArgIndices);
+ ArgInfos.push_back(Arg);
+ OrigArgIndices.push_back(i);
++i;
}
@@ -566,11 +603,17 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
subTargetRegTypeForCallingConv(F, ArgInfos, OrigArgIndices, Outs);
SmallVector<CCValAssign, 8> ArgLocs;
- MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
+ bool IsCalleeVarArg = false;
+ if (Info.Callee.isGlobal()) {
+ const Function *CF = static_cast<const Function *>(Info.Callee.getGlobal());
+ IsCalleeVarArg = CF->isVarArg();
+ }
+ MipsCCState CCInfo(F.getCallingConv(), IsCalleeVarArg, MF, ArgLocs,
F.getContext());
- CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
- const char *Call = Callee.isSymbol() ? Callee.getSymbolName() : nullptr;
+ CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv), 1);
+ const char *Call =
+ Info.Callee.isSymbol() ? Info.Callee.getSymbolName() : nullptr;
CCInfo.AnalyzeCallOperands(Outs, TLI.CCAssignFnForCall(), FuncOrigArgs, Call);
setLocInfo(ArgLocs, Outs);
@@ -599,11 +642,11 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
*STI.getRegBankInfo());
}
- if (OrigRet.Regs[0]) {
+ if (!Info.OrigRet.Ty->isVoidTy()) {
ArgInfos.clear();
SmallVector<unsigned, 8> OrigRetIndices;
- splitToValueTypes(OrigRet, 0, ArgInfos, OrigRetIndices);
+ splitToValueTypes(DL, Info.OrigRet, 0, ArgInfos, OrigRetIndices);
SmallVector<ISD::InputArg, 8> Ins;
subTargetRegTypeForCallingConv(F, ArgInfos, OrigRetIndices, Ins);
@@ -612,7 +655,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MipsCCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs,
F.getContext());
- CCInfo.AnalyzeCallResult(Ins, TLI.CCAssignFnForReturn(), OrigRet.Ty, Call);
+ CCInfo.AnalyzeCallResult(Ins, TLI.CCAssignFnForReturn(), Info.OrigRet.Ty, Call);
setLocInfo(ArgLocs, Ins);
CallReturnHandler Handler(MIRBuilder, MF.getRegInfo(), MIB);
@@ -642,12 +685,12 @@ void MipsCallLowering::subTargetRegTypeForCallingConv(
F.getContext(), F.getCallingConv(), VT);
for (unsigned i = 0; i < NumRegs; ++i) {
- ISD::ArgFlagsTy Flags = Arg.Flags;
+ ISD::ArgFlagsTy Flags = Arg.Flags[0];
if (i == 0)
Flags.setOrigAlign(TLI.getABIAlignmentForCallingConv(Arg.Ty, DL));
else
- Flags.setOrigAlign(1);
+ Flags.setOrigAlign(Align::None());
ISDArgs.emplace_back(Flags, RegisterVT, VT, true, OrigArgIndices[ArgNo],
0);
@@ -657,12 +700,21 @@ void MipsCallLowering::subTargetRegTypeForCallingConv(
}
void MipsCallLowering::splitToValueTypes(
- const ArgInfo &OrigArg, unsigned OriginalIndex,
+ const DataLayout &DL, const ArgInfo &OrigArg, unsigned OriginalIndex,
SmallVectorImpl<ArgInfo> &SplitArgs,
SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const {
- // TODO : perform structure and array split. For now we only deal with
- // types that pass isSupportedType check.
- SplitArgs.push_back(OrigArg);
- SplitArgsOrigIndices.push_back(OriginalIndex);
+ SmallVector<EVT, 4> SplitEVTs;
+ SmallVector<Register, 4> SplitVRegs;
+ const MipsTargetLowering &TLI = *getTLI<MipsTargetLowering>();
+ LLVMContext &Ctx = OrigArg.Ty->getContext();
+
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitEVTs);
+
+ for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
+ ArgInfo Info = ArgInfo{OrigArg.Regs[i], SplitEVTs[i].getTypeForEVT(Ctx)};
+ Info.Flags = OrigArg.Flags;
+ SplitArgs.push_back(Info);
+ SplitArgsOrigIndices.push_back(OriginalIndex);
+ }
}
diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h
index 11c2d53ad35d..a284cf5e26cf 100644
--- a/lib/Target/Mips/MipsCallLowering.h
+++ b/lib/Target/Mips/MipsCallLowering.h
@@ -68,9 +68,8 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
- const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
private:
/// Based on registers available on target machine split or extend
@@ -83,7 +82,8 @@ private:
/// Split structures and arrays, save original argument indices since
/// Mips calling convention needs info about original argument type.
- void splitToValueTypes(const ArgInfo &OrigArg, unsigned OriginalIndex,
+ void splitToValueTypes(const DataLayout &DL, const ArgInfo &OrigArg,
+ unsigned OriginalIndex,
SmallVectorImpl<ArgInfo> &SplitArgs,
SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const;
};
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index eea28df7eda1..f50640521738 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -222,12 +222,7 @@ namespace {
BasicBlockInfo() = default;
- // FIXME: ignore LogAlign for this patch
- //
- unsigned postOffset(unsigned LogAlign = 0) const {
- unsigned PO = Offset + Size;
- return PO;
- }
+ unsigned postOffset() const { return Offset + Size; }
};
std::vector<BasicBlockInfo> BBInfo;
@@ -376,7 +371,7 @@ namespace {
void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
- unsigned getCPELogAlign(const MachineInstr &CPEMI);
+ Align getCPEAlign(const MachineInstr &CPEMI);
void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
unsigned getOffsetOf(MachineInstr *MI) const;
unsigned getUserOffset(CPUser&) const;
@@ -534,11 +529,11 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
MF->push_back(BB);
// MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
- unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+ const Align MaxAlign(MCP->getConstantPoolAlignment());
// Mark the basic block as required by the const-pool.
// If AlignConstantIslands isn't set, use 4-byte alignment for everything.
- BB->setAlignment(AlignConstantIslands ? MaxAlign : 2);
+ BB->setAlignment(AlignConstantIslands ? MaxAlign : Align(4));
// The function needs to be as aligned as the basic blocks. The linker may
// move functions around based on their alignment.
@@ -548,7 +543,8 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
// alignment of all entries as long as BB is sufficiently aligned. Keep
// track of the insertion point for each alignment. We are going to bucket
// sort the entries as they are created.
- SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+ SmallVector<MachineBasicBlock::iterator, 8> InsPoint(Log2(MaxAlign) + 1,
+ BB->end());
// Add all of the constants from the constant pool to the end block, use an
// identity mapping of CPI's to CPE's.
@@ -576,7 +572,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
// Ensure that future entries with higher alignment get inserted before
// CPEMI. This is bucket sort with iterators.
- for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+ for (unsigned a = LogAlign + 1; a <= Log2(MaxAlign); ++a)
if (InsPoint[a] == InsAt)
InsPoint[a] = CPEMI;
// Add a new CPEntry, but no corresponding CPUser yet.
@@ -621,20 +617,18 @@ MipsConstantIslands::CPEntry
return nullptr;
}
-/// getCPELogAlign - Returns the required alignment of the constant pool entry
+/// getCPEAlign - Returns the required alignment of the constant pool entry
/// represented by CPEMI. Alignment is measured in log2(bytes) units.
-unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr &CPEMI) {
+Align MipsConstantIslands::getCPEAlign(const MachineInstr &CPEMI) {
assert(CPEMI.getOpcode() == Mips::CONSTPOOL_ENTRY);
// Everything is 4-byte aligned unless AlignConstantIslands is set.
if (!AlignConstantIslands)
- return 2;
+ return Align(4);
unsigned CPI = CPEMI.getOperand(1).getIndex();
assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
- unsigned Align = MCP->getConstants()[CPI].getAlignment();
- assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
- return Log2_32(Align);
+ return Align(MCP->getConstants()[CPI].getAlignment());
}
/// initializeFunctionInfo - Do the initial scan of the function, building up
@@ -940,13 +934,13 @@ bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset,
bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
MachineBasicBlock* Water, CPUser &U,
unsigned &Growth) {
- unsigned CPELogAlign = getCPELogAlign(*U.CPEMI);
- unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
- unsigned NextBlockOffset, NextBlockAlignment;
+ unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset();
+ unsigned NextBlockOffset;
+ Align NextBlockAlignment;
MachineFunction::const_iterator NextBlock = ++Water->getIterator();
if (NextBlock == MF->end()) {
NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
- NextBlockAlignment = 0;
+ NextBlockAlignment = Align::None();
} else {
NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
NextBlockAlignment = NextBlock->getAlignment();
@@ -961,7 +955,7 @@ bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
Growth = CPEEnd - NextBlockOffset;
// Compute the padding that would go at the end of the CPE to align the next
// block.
- Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
+ Growth += offsetToAlignment(CPEEnd, NextBlockAlignment);
// If the CPE is to be inserted before the instruction, that will raise
// the offset of the instruction. Also account for unknown alignment padding
@@ -1221,7 +1215,6 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
CPUser &U = CPUsers[CPUserIndex];
MachineInstr *UserMI = U.MI;
MachineInstr *CPEMI = U.CPEMI;
- unsigned CPELogAlign = getCPELogAlign(*CPEMI);
MachineBasicBlock *UserMBB = UserMI->getParent();
const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
@@ -1231,7 +1224,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
// Size of branch to insert.
unsigned Delta = 2;
// Compute the offset where the CPE will begin.
- unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+ unsigned CPEOffset = UserBBI.postOffset() + Delta;
if (isOffsetInRange(UserOffset, CPEOffset, U)) {
LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
@@ -1257,9 +1250,8 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
// Try to split the block so it's fully aligned. Compute the latest split
// point where we can add a 4-byte branch instruction, and then align to
- // LogAlign which is the largest possible alignment in the function.
- unsigned LogAlign = MF->getAlignment();
- assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+ // Align which is the largest possible alignment in the function.
+ const Align Align = MF->getAlignment();
unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
BaseInsertOffset));
@@ -1270,7 +1262,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
BaseInsertOffset -= 4;
LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
- << " la=" << LogAlign << '\n');
+ << " la=" << Log2(Align) << '\n');
// This could point off the end of the block if we've already got constant
// pool entries following this block; only the last one is in the water list.
@@ -1295,8 +1287,8 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
CPUser &U = CPUsers[CPUIndex];
if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
// Shift insertion point by one unit of alignment so it is within reach.
- BaseInsertOffset -= 1u << LogAlign;
- EndInsertOffset -= 1u << LogAlign;
+ BaseInsertOffset -= Align.value();
+ EndInsertOffset -= Align.value();
}
// This is overly conservative, as we don't account for CPEMIs being
// reused within the block, but it doesn't matter much. Also assume CPEs
@@ -1399,7 +1391,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
++NumCPEs;
// Mark the basic block as aligned as required by the const-pool entry.
- NewIsland->setAlignment(getCPELogAlign(*U.CPEMI));
+ NewIsland->setAlignment(getCPEAlign(*U.CPEMI));
// Increase the size of the island block to account for the new entry.
BBInfo[NewIsland->getNumber()].Size += Size;
@@ -1431,10 +1423,11 @@ void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
BBInfo[CPEBB->getNumber()].Size = 0;
// This block no longer needs to be aligned.
- CPEBB->setAlignment(0);
- } else
+ CPEBB->setAlignment(Align(1));
+ } else {
// Entries are sorted by descending alignment, so realign from the front.
- CPEBB->setAlignment(getCPELogAlign(*CPEBB->begin()));
+ CPEBB->setAlignment(getCPEAlign(*CPEBB->begin()));
+ }
adjustBBOffsetsAfter(CPEBB);
// An island has only one predecessor BB and one successor BB. Check if
@@ -1529,7 +1522,7 @@ MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
// We should have a way to back out this alignment restriction if we "can" later.
// but it is not harmful.
//
- DestBB->setAlignment(2);
+ DestBB->setAlignment(Align(4));
Br.MaxDisp = ((1<<24)-1) * 2;
MI->setDesc(TII->get(Mips::JalB16));
}
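
The isWaterInRange change above now feeds an Align value into offsetToAlignment when accounting for the padding between a constant-pool entry and the next block. The padding term itself is simple modular arithmetic (sketch only; offsetToAlign stands in for llvm::offsetToAlignment):

#include <cassert>
#include <cstdint>

// Sketch only: bytes needed after Value so the next block starts aligned.
static uint64_t offsetToAlign(uint64_t Value, uint64_t AlignBytes) {
  return (AlignBytes - Value % AlignBytes) % AlignBytes;
}

int main() {
  assert(offsetToAlign(0x1006, 4) == 2); // CPE ends misaligned: pad 2 bytes
  assert(offsetToAlign(0x1008, 4) == 0); // already aligned: no padding
  assert(offsetToAlign(0x1001, 1) == 0); // Align::None(): never pads
  return 0;
}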
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index daca8b907081..d3e68c014fb7 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -12,12 +12,19 @@
// ImmLeaf
def immZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}]>;
+def timmZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}], NOOP_SDNodeXForm, timm>;
def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>;
+def timmZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}], NOOP_SDNodeXForm, timm>;
def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>;
+def timmZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}], NOOP_SDNodeXForm, timm>;
def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
+def timmZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}], NOOP_SDNodeXForm, timm>;
def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
+def timmZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}], NOOP_SDNodeXForm, timm>;
def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
+def timmZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}], NOOP_SDNodeXForm, timm>;
def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
+def timmSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}], NOOP_SDNodeXForm, timm>;
def immSExt10 : ImmLeaf<i32, [{return isInt<10>(Imm);}]>;
// Mips-specific dsp nodes
@@ -306,7 +313,7 @@ class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
dag OutOperandList = (outs ROT:$rt);
dag InOperandList = (ins ROS:$rs, uimm5:$sa, ROS:$src);
string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
- list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))];
+ list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, timmZExt5:$sa))];
InstrItinClass Itinerary = itin;
string Constraints = "$src = $rt";
string BaseOpcode = instr_asm;
@@ -443,7 +450,7 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
dag OutOperandList = (outs GPR32Opnd:$rd);
dag InOperandList = (ins uimm10:$mask);
string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
- list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
+ list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode timmZExt10:$mask))];
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
bit isMoveReg = 1;
@@ -454,7 +461,7 @@ class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
dag OutOperandList = (outs);
dag InOperandList = (ins GPR32Opnd:$rs, uimm10:$mask);
string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
- list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)];
+ list<dag> Pattern = [(OpNode GPR32Opnd:$rs, timmZExt10:$mask)];
InstrItinClass Itinerary = itin;
string BaseOpcode = instr_asm;
bit isMoveReg = 1;
@@ -1096,14 +1103,14 @@ class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
NoItinerary, DSPROpnd>;
// Misc
-class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, immZExt5,
+class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, timmZExt5,
NoItinerary>;
-class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, immZExt2,
+class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, timmZExt2,
NoItinerary>;
class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5,
- immZExt5, NoItinerary>;
+ timmZExt5, NoItinerary>;
// Pseudos.
def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32,
diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp
index 65d84a6c44a0..00cd284e7094 100644
--- a/lib/Target/Mips/MipsExpandPseudo.cpp
+++ b/lib/Target/Mips/MipsExpandPseudo.cpp
@@ -99,15 +99,15 @@ bool MipsExpandPseudo::expandAtomicCmpSwapSubword(
: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
}
- unsigned Dest = I->getOperand(0).getReg();
- unsigned Ptr = I->getOperand(1).getReg();
- unsigned Mask = I->getOperand(2).getReg();
- unsigned ShiftCmpVal = I->getOperand(3).getReg();
- unsigned Mask2 = I->getOperand(4).getReg();
- unsigned ShiftNewVal = I->getOperand(5).getReg();
- unsigned ShiftAmnt = I->getOperand(6).getReg();
- unsigned Scratch = I->getOperand(7).getReg();
- unsigned Scratch2 = I->getOperand(8).getReg();
+ Register Dest = I->getOperand(0).getReg();
+ Register Ptr = I->getOperand(1).getReg();
+ Register Mask = I->getOperand(2).getReg();
+ Register ShiftCmpVal = I->getOperand(3).getReg();
+ Register Mask2 = I->getOperand(4).getReg();
+ Register ShiftNewVal = I->getOperand(5).getReg();
+ Register ShiftAmnt = I->getOperand(6).getReg();
+ Register Scratch = I->getOperand(7).getReg();
+ Register Scratch2 = I->getOperand(8).getReg();
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB.getBasicBlock();
@@ -240,11 +240,11 @@ bool MipsExpandPseudo::expandAtomicCmpSwap(MachineBasicBlock &BB,
MOVE = Mips::OR64;
}
- unsigned Dest = I->getOperand(0).getReg();
- unsigned Ptr = I->getOperand(1).getReg();
- unsigned OldVal = I->getOperand(2).getReg();
- unsigned NewVal = I->getOperand(3).getReg();
- unsigned Scratch = I->getOperand(4).getReg();
+ Register Dest = I->getOperand(0).getReg();
+ Register Ptr = I->getOperand(1).getReg();
+ Register OldVal = I->getOperand(2).getReg();
+ Register NewVal = I->getOperand(3).getReg();
+ Register Scratch = I->getOperand(4).getReg();
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB.getBasicBlock();
@@ -374,15 +374,15 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword(
llvm_unreachable("Unknown subword atomic pseudo for expansion!");
}
- unsigned Dest = I->getOperand(0).getReg();
- unsigned Ptr = I->getOperand(1).getReg();
- unsigned Incr = I->getOperand(2).getReg();
- unsigned Mask = I->getOperand(3).getReg();
- unsigned Mask2 = I->getOperand(4).getReg();
- unsigned ShiftAmnt = I->getOperand(5).getReg();
- unsigned OldVal = I->getOperand(6).getReg();
- unsigned BinOpRes = I->getOperand(7).getReg();
- unsigned StoreVal = I->getOperand(8).getReg();
+ Register Dest = I->getOperand(0).getReg();
+ Register Ptr = I->getOperand(1).getReg();
+ Register Incr = I->getOperand(2).getReg();
+ Register Mask = I->getOperand(3).getReg();
+ Register Mask2 = I->getOperand(4).getReg();
+ Register ShiftAmnt = I->getOperand(5).getReg();
+ Register OldVal = I->getOperand(6).getReg();
+ Register BinOpRes = I->getOperand(7).getReg();
+ Register StoreVal = I->getOperand(8).getReg();
const BasicBlock *LLVM_BB = BB.getBasicBlock();
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -513,10 +513,10 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB,
BEQ = Mips::BEQ64;
}
- unsigned OldVal = I->getOperand(0).getReg();
- unsigned Ptr = I->getOperand(1).getReg();
- unsigned Incr = I->getOperand(2).getReg();
- unsigned Scratch = I->getOperand(3).getReg();
+ Register OldVal = I->getOperand(0).getReg();
+ Register Ptr = I->getOperand(1).getReg();
+ Register Incr = I->getOperand(2).getReg();
+ Register Scratch = I->getOperand(3).getReg();
unsigned Opcode = 0;
unsigned OR = 0;
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 123d3cc242f0..80f288ac500c 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -1162,14 +1162,20 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
if (ArgVT == MVT::f32) {
VA.convertToReg(Mips::F12);
} else if (ArgVT == MVT::f64) {
- VA.convertToReg(Mips::D6);
+ if (Subtarget->isFP64bit())
+ VA.convertToReg(Mips::D6_64);
+ else
+ VA.convertToReg(Mips::D6);
}
} else if (i == 1) {
if ((firstMVT == MVT::f32) || (firstMVT == MVT::f64)) {
if (ArgVT == MVT::f32) {
VA.convertToReg(Mips::F14);
} else if (ArgVT == MVT::f64) {
- VA.convertToReg(Mips::D7);
+ if (Subtarget->isFP64bit())
+ VA.convertToReg(Mips::D7_64);
+ else
+ VA.convertToReg(Mips::D7);
}
}
}
@@ -1722,7 +1728,7 @@ bool MipsFastISel::selectRet(const Instruction *I) {
return false;
unsigned SrcReg = Reg + VA.getValNo();
- unsigned DestReg = VA.getLocReg();
+ Register DestReg = VA.getLocReg();
// Avoid a cross-class copy. This is very unlikely.
if (!MRI.getRegClass(SrcReg)->contains(DestReg))
return false;
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 0537cfd1cb30..612b2b712fa8 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -24,8 +24,9 @@ protected:
const MipsSubtarget &STI;
public:
- explicit MipsFrameLowering(const MipsSubtarget &sti, unsigned Alignment)
- : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {}
+ explicit MipsFrameLowering(const MipsSubtarget &sti, Align Alignment)
+ : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {
+ }
static const MipsFrameLowering *create(const MipsSubtarget &ST);
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 9ba54d6bb73c..e5997af3bcc5 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -65,7 +65,7 @@ bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
/// getGlobalBaseReg - Output the instructions required to put the
/// GOT address into a register.
SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
- unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
+ Register GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
return CurDAG->getRegister(GlobalBaseReg, getTargetLowering()->getPointerTy(
CurDAG->getDataLayout()))
.getNode();
@@ -217,6 +217,51 @@ bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
return false;
}
+/// Convert vector addition to vector subtraction when that allows encoding the
+/// constant as an immediate and thus avoids an extra 'ldi' instruction.
+/// add X, <-1, -1...> --> sub X, <1, 1...>
+bool MipsDAGToDAGISel::selectVecAddAsVecSubIfProfitable(SDNode *Node) {
+ assert(Node->getOpcode() == ISD::ADD && "Should only get 'add' here.");
+
+ EVT VT = Node->getValueType(0);
+ assert(VT.isVector() && "Should only be called for vectors.");
+
+ SDValue X = Node->getOperand(0);
+ SDValue C = Node->getOperand(1);
+
+ auto *BVN = dyn_cast<BuildVectorSDNode>(C);
+ if (!BVN)
+ return false;
+
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+
+ if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+ 8, !Subtarget->isLittle()))
+ return false;
+
+ auto IsInlineConstant = [](const APInt &Imm) { return Imm.isIntN(5); };
+
+ if (IsInlineConstant(SplatValue))
+ return false; // Can already be encoded as an immediate.
+
+ APInt NegSplatValue = 0 - SplatValue;
+ if (!IsInlineConstant(NegSplatValue))
+ return false; // Even if we negate it, it won't help.
+
+ SDLoc DL(Node);
+
+ SDValue NegC = CurDAG->FoldConstantArithmetic(
+ ISD::SUB, DL, VT, CurDAG->getConstant(0, DL, VT).getNode(), C.getNode());
+ assert(NegC && "Constant-folding failed!");
+ SDValue NewNode = CurDAG->getNode(ISD::SUB, DL, VT, X, NegC);
+
+ ReplaceNode(Node, NewNode.getNode());
+ SelectCode(NewNode.getNode());
+ return true;
+}
+
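A minimal standalone sketch of the decision made by selectVecAddAsVecSubIfProfitable, on a plain integer splat instead of an APInt, assuming a 5-bit unsigned immediate field as in IsInlineConstant above (illustrative only, not part of the patch):

    #include <cstdint>
    #include <optional>

    // Returns the value to subtract when rewriting  add X, Splat  as
    // sub X, Neg  is profitable, std::nullopt otherwise. The 5-bit unsigned
    // immediate range is an assumption mirroring IsInlineConstant.
    static std::optional<int64_t> negatedImmForAddAsSub(int64_t Splat) {
      auto FitsInUImm5 = [](int64_t V) { return V >= 0 && V < 32; };
      if (FitsInUImm5(Splat))
        return std::nullopt; // Already encodable as an immediate; keep the add.
      int64_t Neg = -Splat;
      if (!FitsInUImm5(Neg))
        return std::nullopt; // Negating does not help either.
      return Neg;            // e.g. Splat = -1 yields Neg = 1.
    }

For example, a splat of -1 is rewritten as a subtraction with immediate 1, matching the add X, <-1, -1...> --> sub X, <1, 1...> case in the comment above.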
/// Select instructions not customized! Used for
/// expanded, promoted and normal instructions
void MipsDAGToDAGISel::Select(SDNode *Node) {
@@ -236,6 +281,12 @@ void MipsDAGToDAGISel::Select(SDNode *Node) {
switch(Opcode) {
default: break;
+ case ISD::ADD:
+ if (Node->getSimpleValueType(0).isVector() &&
+ selectVecAddAsVecSubIfProfitable(Node))
+ return;
+ break;
+
// Get target GOT address.
case ISD::GLOBAL_OFFSET_TABLE:
ReplaceNode(Node, getGlobalBaseReg());
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index bae3bbf71f3b..a768589b374b 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -125,6 +125,11 @@ private:
/// starting at bit zero.
virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+ /// Convert vector addition to vector subtraction when that allows encoding the
+ /// constant as an immediate and thus avoids an extra 'ldi' instruction.
+ /// add X, <-1, -1...> --> sub X, <1, 1...>
+ bool selectVecAddAsVecSubIfProfitable(SDNode *Node);
+
void Select(SDNode *N) override;
virtual bool trySelect(SDNode *Node) = 0;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 0ff09007da4b..bf1b4756b24f 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -83,10 +83,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool>
-LargeGOT("mxgot", cl::Hidden,
- cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false));
-
-static cl::opt<bool>
NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
cl::desc("MIPS: Don't trap on integer division by zero."),
cl::init(false));
@@ -330,7 +326,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
}
// Set LoadExtAction for f16 vectors to Expand
- for (MVT VT : MVT::fp_vector_valuetypes()) {
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
MVT F16VT = MVT::getVectorVT(MVT::f16, VT.getVectorNumElements());
if (F16VT.isValid())
setLoadExtAction(ISD::EXTLOAD, VT, F16VT, Expand);
@@ -518,11 +514,12 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setLibcallName(RTLIB::SRA_I128, nullptr);
}
- setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2);
+ setMinFunctionAlignment(Subtarget.isGP64bit() ? Align(8) : Align(4));
// The arguments on the stack are defined in terms of 4-byte slots on O32
// and 8-byte slots on N32/N64.
- setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? 8 : 4);
+ setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? Align(8)
+ : Align(4));
setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP);
@@ -552,8 +549,9 @@ MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
!Subtarget.inMicroMipsMode();
// Disable if either of the following is true:
- // We do not generate PIC, the ABI is not O32, LargeGOT is being used.
- if (!TM.isPositionIndependent() || !TM.getABI().IsO32() || LargeGOT)
+ // We do not generate PIC, the ABI is not O32, XGOT is being used.
+ if (!TM.isPositionIndependent() || !TM.getABI().IsO32() ||
+ Subtarget.useXGOT())
UseFastISel = false;
return UseFastISel ? Mips::createFastISel(funcInfo, libInfo) : nullptr;
@@ -1257,7 +1255,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
static unsigned
addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
{
- unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
+ Register VReg = MF.getRegInfo().createVirtualRegister(RC);
MF.getRegInfo().addLiveIn(PReg, VReg);
return VReg;
}
@@ -1477,10 +1475,10 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
llvm_unreachable("Unknown pseudo atomic for replacement!");
}
- unsigned OldVal = MI.getOperand(0).getReg();
- unsigned Ptr = MI.getOperand(1).getReg();
- unsigned Incr = MI.getOperand(2).getReg();
- unsigned Scratch = RegInfo.createVirtualRegister(RegInfo.getRegClass(OldVal));
+ Register OldVal = MI.getOperand(0).getReg();
+ Register Ptr = MI.getOperand(1).getReg();
+ Register Incr = MI.getOperand(2).getReg();
+ Register Scratch = RegInfo.createVirtualRegister(RegInfo.getRegClass(OldVal));
MachineBasicBlock::iterator II(MI);
@@ -1519,8 +1517,8 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
// containing the word.
//
- unsigned PtrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Ptr));
- unsigned IncrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Incr));
+ Register PtrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Ptr));
+ Register IncrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Incr));
BuildMI(*BB, II, DL, TII->get(Mips::COPY), IncrCopy).addReg(Incr);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr);
@@ -1556,7 +1554,7 @@ MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
- unsigned ScrReg = RegInfo.createVirtualRegister(RC);
+ Register ScrReg = RegInfo.createVirtualRegister(RC);
assert(Size < 32);
int64_t ShiftImm = 32 - (Size * 8);
@@ -1581,21 +1579,21 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Dest = MI.getOperand(0).getReg();
- unsigned Ptr = MI.getOperand(1).getReg();
- unsigned Incr = MI.getOperand(2).getReg();
-
- unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
- unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
- unsigned Mask = RegInfo.createVirtualRegister(RC);
- unsigned Mask2 = RegInfo.createVirtualRegister(RC);
- unsigned Incr2 = RegInfo.createVirtualRegister(RC);
- unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
- unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
- unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
- unsigned Scratch = RegInfo.createVirtualRegister(RC);
- unsigned Scratch2 = RegInfo.createVirtualRegister(RC);
- unsigned Scratch3 = RegInfo.createVirtualRegister(RC);
+ Register Dest = MI.getOperand(0).getReg();
+ Register Ptr = MI.getOperand(1).getReg();
+ Register Incr = MI.getOperand(2).getReg();
+
+ Register AlignedAddr = RegInfo.createVirtualRegister(RCp);
+ Register ShiftAmt = RegInfo.createVirtualRegister(RC);
+ Register Mask = RegInfo.createVirtualRegister(RC);
+ Register Mask2 = RegInfo.createVirtualRegister(RC);
+ Register Incr2 = RegInfo.createVirtualRegister(RC);
+ Register MaskLSB2 = RegInfo.createVirtualRegister(RCp);
+ Register PtrLSB2 = RegInfo.createVirtualRegister(RC);
+ Register MaskUpper = RegInfo.createVirtualRegister(RC);
+ Register Scratch = RegInfo.createVirtualRegister(RC);
+ Register Scratch2 = RegInfo.createVirtualRegister(RC);
+ Register Scratch3 = RegInfo.createVirtualRegister(RC);
unsigned AtomicOp = 0;
switch (MI.getOpcode()) {
@@ -1678,7 +1676,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
if (Subtarget.isLittle()) {
BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
} else {
- unsigned Off = RegInfo.createVirtualRegister(RC);
+ Register Off = RegInfo.createVirtualRegister(RC);
BuildMI(BB, DL, TII->get(Mips::XORi), Off)
.addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
@@ -1738,12 +1736,12 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32
? Mips::ATOMIC_CMP_SWAP_I32_POSTRA
: Mips::ATOMIC_CMP_SWAP_I64_POSTRA;
- unsigned Dest = MI.getOperand(0).getReg();
- unsigned Ptr = MI.getOperand(1).getReg();
- unsigned OldVal = MI.getOperand(2).getReg();
- unsigned NewVal = MI.getOperand(3).getReg();
+ Register Dest = MI.getOperand(0).getReg();
+ Register Ptr = MI.getOperand(1).getReg();
+ Register OldVal = MI.getOperand(2).getReg();
+ Register NewVal = MI.getOperand(3).getReg();
- unsigned Scratch = MRI.createVirtualRegister(RC);
+ Register Scratch = MRI.createVirtualRegister(RC);
MachineBasicBlock::iterator II(MI);
// We need to create copies of the various registers and kill them at the
@@ -1751,9 +1749,9 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
// after fast register allocation, the spills will end up outside of the
// blocks that their values are defined in, causing livein errors.
- unsigned PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr));
- unsigned OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal));
- unsigned NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal));
+ Register PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr));
+ Register OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal));
+ Register NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal));
BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), OldValCopy).addReg(OldVal);
@@ -1790,22 +1788,22 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Dest = MI.getOperand(0).getReg();
- unsigned Ptr = MI.getOperand(1).getReg();
- unsigned CmpVal = MI.getOperand(2).getReg();
- unsigned NewVal = MI.getOperand(3).getReg();
-
- unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
- unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
- unsigned Mask = RegInfo.createVirtualRegister(RC);
- unsigned Mask2 = RegInfo.createVirtualRegister(RC);
- unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
- unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
- unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
- unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
- unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
- unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
- unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
+ Register Dest = MI.getOperand(0).getReg();
+ Register Ptr = MI.getOperand(1).getReg();
+ Register CmpVal = MI.getOperand(2).getReg();
+ Register NewVal = MI.getOperand(3).getReg();
+
+ Register AlignedAddr = RegInfo.createVirtualRegister(RCp);
+ Register ShiftAmt = RegInfo.createVirtualRegister(RC);
+ Register Mask = RegInfo.createVirtualRegister(RC);
+ Register Mask2 = RegInfo.createVirtualRegister(RC);
+ Register ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
+ Register ShiftedNewVal = RegInfo.createVirtualRegister(RC);
+ Register MaskLSB2 = RegInfo.createVirtualRegister(RCp);
+ Register PtrLSB2 = RegInfo.createVirtualRegister(RC);
+ Register MaskUpper = RegInfo.createVirtualRegister(RC);
+ Register MaskedCmpVal = RegInfo.createVirtualRegister(RC);
+ Register MaskedNewVal = RegInfo.createVirtualRegister(RC);
unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I8
? Mips::ATOMIC_CMP_SWAP_I8_POSTRA
: Mips::ATOMIC_CMP_SWAP_I16_POSTRA;
@@ -1820,8 +1818,8 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
// value isn't a problem.
// The Dead flag is needed as the value in scratch isn't used by any other
// instruction. Kill isn't used as Dead is more precise.
- unsigned Scratch = RegInfo.createVirtualRegister(RC);
- unsigned Scratch2 = RegInfo.createVirtualRegister(RC);
+ Register Scratch = RegInfo.createVirtualRegister(RC);
+ Register Scratch2 = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -1859,7 +1857,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
if (Subtarget.isLittle()) {
BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
} else {
- unsigned Off = RegInfo.createVirtualRegister(RC);
+ Register Off = RegInfo.createVirtualRegister(RC);
BuildMI(BB, DL, TII->get(Mips::XORi), Off)
.addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
@@ -1967,10 +1965,10 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
// %gp_rel relocation
return getAddrGPRel(N, SDLoc(N), Ty, DAG, ABI.IsN64());
- // %hi/%lo relocation
+ // %hi/%lo relocation
return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
- // %highest/%higher/%hi/%lo relocation
- : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
+ // %highest/%higher/%hi/%lo relocation
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
}
// Every other architecture would use shouldAssumeDSOLocal in here, but
@@ -1987,7 +1985,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
if (GV->hasLocalLinkage())
return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
- if (LargeGOT)
+ if (Subtarget.useXGOT())
return getAddrGlobalLargeGOT(
N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16,
DAG.getEntryNode(),
@@ -2149,7 +2147,8 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Node->getValueType(0);
SDValue Chain = Node->getOperand(0);
SDValue VAListPtr = Node->getOperand(1);
- unsigned Align = Node->getConstantOperandVal(3);
+ const Align Align =
+ llvm::MaybeAlign(Node->getConstantOperandVal(3)).valueOrOne();
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
SDLoc DL(Node);
unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4;
@@ -2166,14 +2165,13 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// when the pointer is still aligned from the last va_arg (or pair of
// va_args for the i64 on O32 case).
if (Align > getMinStackArgumentAlignment()) {
- assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
+ VAList = DAG.getNode(
+ ISD::ADD, DL, VAList.getValueType(), VAList,
+ DAG.getConstant(Align.value() - 1, DL, VAList.getValueType()));
- VAList = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
- DAG.getConstant(Align - 1, DL, VAList.getValueType()));
-
- VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
- DAG.getConstant(-(int64_t)Align, DL,
- VAList.getValueType()));
+ VAList = DAG.getNode(
+ ISD::AND, DL, VAList.getValueType(), VAList,
+ DAG.getConstant(-(int64_t)Align.value(), DL, VAList.getValueType()));
}
// Increment the pointer, VAList, to the next vaarg.
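The rewritten sequence keeps the usual round-up-to-alignment idiom, now emitted as an ISD::ADD followed by an ISD::AND. A small sketch of the same arithmetic on a plain integer address, assuming a power-of-two alignment (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // (P + A - 1) & -A rounds P up to the next multiple of the power-of-two
    // alignment A; this is what the ADD/AND pair above computes for VAList.
    static uint64_t alignUp(uint64_t P, uint64_t A) {
      assert(A != 0 && (A & (A - 1)) == 0 && "alignment must be a power of two");
      return (P + A - 1) & ~(A - 1); // ~(A - 1) == -A for unsigned A
    }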
@@ -2870,7 +2868,7 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
#include "MipsGenCallingConv.inc"
CCAssignFn *MipsTargetLowering::CCAssignFnForCall() const{
- return CC_Mips;
+ return CC_Mips_FixedArg;
}
CCAssignFn *MipsTargetLowering::CCAssignFnForReturn() const{
@@ -3167,7 +3165,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Arg, DAG.getConstant(1, DL, MVT::i32));
if (!Subtarget.isLittle())
std::swap(Lo, Hi);
- unsigned LocRegLo = VA.getLocReg();
+ Register LocRegLo = VA.getLocReg();
unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
RegsToPass.push_back(std::make_pair(LocRegLo, Lo));
RegsToPass.push_back(std::make_pair(LocRegHigh, Hi));
@@ -3270,7 +3268,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (InternalLinkage)
Callee = getAddrLocal(G, DL, Ty, DAG, ABI.IsN32() || ABI.IsN64());
- else if (LargeGOT) {
+ else if (Subtarget.useXGOT()) {
Callee = getAddrGlobalLargeGOT(G, DL, Ty, DAG, MipsII::MO_CALL_HI16,
MipsII::MO_CALL_LO16, Chain,
FuncInfo->callPtrInfo(Val));
@@ -3292,7 +3290,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!IsPIC) // static
Callee = DAG.getTargetExternalSymbol(
Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG);
- else if (LargeGOT) {
+ else if (Subtarget.useXGOT()) {
Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16,
MipsII::MO_CALL_LO16, Chain,
FuncInfo->callPtrInfo(Sym));
@@ -3523,7 +3521,7 @@ SDValue MipsTargetLowering::LowerFormalArguments(
// Arguments stored on registers
if (IsRegLoc) {
MVT RegVT = VA.getLocVT();
- unsigned ArgReg = VA.getLocReg();
+ Register ArgReg = VA.getLocReg();
const TargetRegisterClass *RC = getRegClassFor(RegVT);
// Transform the arguments stored on
@@ -4568,20 +4566,20 @@ MachineBasicBlock *MipsTargetLowering::emitPseudoD_SELECT(MachineInstr &MI,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
+Register MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const {
// Named registers is expected to be fairly rare. For now, just support $28
// since the linux kernel uses it.
if (Subtarget.isGP64bit()) {
- unsigned Reg = StringSwitch<unsigned>(RegName)
+ Register Reg = StringSwitch<Register>(RegName)
.Case("$28", Mips::GP_64)
- .Default(0);
+ .Default(Register());
if (Reg)
return Reg;
} else {
- unsigned Reg = StringSwitch<unsigned>(RegName)
+ Register Reg = StringSwitch<Register>(RegName)
.Case("$28", Mips::GP)
- .Default(0);
+ .Default(Register());
if (Reg)
return Reg;
}
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 2db60e9801f1..0a5cddd45afb 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -304,11 +304,12 @@ class TargetRegisterClass;
unsigned &NumIntermediates, MVT &RegisterVT) const override;
/// Return the correct alignment for the current calling convention.
- unsigned getABIAlignmentForCallingConv(Type *ArgTy,
- DataLayout DL) const override {
+ Align getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const override {
+ const Align ABIAlign(DL.getABITypeAlignment(ArgTy));
if (ArgTy->isVectorTy())
- return std::min(DL.getABITypeAlignment(ArgTy), 8U);
- return DL.getABITypeAlignment(ArgTy);
+ return std::min(ABIAlign, Align(8));
+ return ABIAlign;
}
ISD::NodeType getExtendForAtomicOps() const override {
@@ -347,8 +348,8 @@ class TargetRegisterClass;
void HandleByVal(CCState *, unsigned &, unsigned) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index fbd56206b249..6bb25ee5754d 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -677,7 +677,8 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
return MIB;
}
-bool MipsInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+bool MipsInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
assert(!MI.isBundle() &&
"TargetInstrInfo::findCommutedOpIndices() can't handle bundles");
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index a626c0c3fdb8..092a960b4ba7 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -148,7 +148,7 @@ public:
MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc,
MachineBasicBlock::iterator I) const;
- bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
/// Perform target specific instruction verification.
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index a4e85a38ab28..58167e0f344d 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -211,9 +211,9 @@ def HasCnMips : Predicate<"Subtarget->hasCnMips()">,
AssemblerPredicate<"FeatureCnMips">;
def NotCnMips : Predicate<"!Subtarget->hasCnMips()">,
AssemblerPredicate<"!FeatureCnMips">;
-def IsSym32 : Predicate<"Subtarget->HasSym32()">,
+def IsSym32 : Predicate<"Subtarget->hasSym32()">,
AssemblerPredicate<"FeatureSym32">;
-def IsSym64 : Predicate<"!Subtarget->HasSym32()">,
+def IsSym64 : Predicate<"!Subtarget->hasSym32()">,
AssemblerPredicate<"!FeatureSym32">;
def IsN64 : Predicate<"Subtarget->isABI_N64()">;
def IsNotN64 : Predicate<"!Subtarget->isABI_N64()">;
@@ -1263,6 +1263,7 @@ def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
// Node immediate fits as 7-bit zero extended on target immediate.
def immZExt7 : PatLeaf<(imm), [{ return isUInt<7>(N->getZExtValue()); }]>;
+def timmZExt7 : PatLeaf<(timm), [{ return isUInt<7>(N->getZExtValue()); }]>;
// Node immediate fits as 16-bit zero extended on target immediate.
// The LO16 param means that only the lower 16 bits of the node
@@ -1295,6 +1296,7 @@ def immZExt32 : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>;
// shamt field must fit in 5 bits.
def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>;
+def timmZExt5 : TImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>;
def immZExt5Plus1 : PatLeaf<(imm), [{
return isUInt<5>(N->getZExtValue() - 1);
@@ -3142,25 +3144,31 @@ multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
def : MipsPat<(MipsHi tconstpool:$in), (Lui tconstpool:$in)>;
def : MipsPat<(MipsHi texternalsym:$in), (Lui texternalsym:$in)>;
- def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>;
+ def : MipsPat<(MipsLo tglobaladdr:$in),
+ (Addiu ZeroReg, tglobaladdr:$in)>;
def : MipsPat<(MipsLo tblockaddress:$in),
(Addiu ZeroReg, tblockaddress:$in)>;
- def : MipsPat<(MipsLo tjumptable:$in), (Addiu ZeroReg, tjumptable:$in)>;
- def : MipsPat<(MipsLo tconstpool:$in), (Addiu ZeroReg, tconstpool:$in)>;
+ def : MipsPat<(MipsLo tjumptable:$in),
+ (Addiu ZeroReg, tjumptable:$in)>;
+ def : MipsPat<(MipsLo tconstpool:$in),
+ (Addiu ZeroReg, tconstpool:$in)>;
def : MipsPat<(MipsLo tglobaltlsaddr:$in),
(Addiu ZeroReg, tglobaltlsaddr:$in)>;
- def : MipsPat<(MipsLo texternalsym:$in), (Addiu ZeroReg, texternalsym:$in)>;
+ def : MipsPat<(MipsLo texternalsym:$in),
+ (Addiu ZeroReg, texternalsym:$in)>;
def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaladdr:$lo)),
- (Addiu GPROpnd:$hi, tglobaladdr:$lo)>;
+ (Addiu GPROpnd:$hi, tglobaladdr:$lo)>;
def : MipsPat<(add GPROpnd:$hi, (MipsLo tblockaddress:$lo)),
- (Addiu GPROpnd:$hi, tblockaddress:$lo)>;
+ (Addiu GPROpnd:$hi, tblockaddress:$lo)>;
def : MipsPat<(add GPROpnd:$hi, (MipsLo tjumptable:$lo)),
- (Addiu GPROpnd:$hi, tjumptable:$lo)>;
+ (Addiu GPROpnd:$hi, tjumptable:$lo)>;
def : MipsPat<(add GPROpnd:$hi, (MipsLo tconstpool:$lo)),
- (Addiu GPROpnd:$hi, tconstpool:$lo)>;
+ (Addiu GPROpnd:$hi, tconstpool:$lo)>;
def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>;
+ (Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>;
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo texternalsym:$lo)),
+ (Addiu GPROpnd:$hi, texternalsym:$lo)>;
}
// wrapper_pic
diff --git a/lib/Target/Mips/MipsInstructionSelector.cpp b/lib/Target/Mips/MipsInstructionSelector.cpp
index 45a47ad3c087..f8fc7cb0898b 100644
--- a/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -17,6 +17,7 @@
#include "MipsTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#define DEBUG_TYPE "mips-isel"
@@ -33,7 +34,7 @@ public:
MipsInstructionSelector(const MipsTargetMachine &TM, const MipsSubtarget &STI,
const MipsRegisterBankInfo &RBI);
- bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ bool select(MachineInstr &I) override;
static const char *getName() { return DEBUG_TYPE; }
private:
@@ -44,6 +45,8 @@ private:
const TargetRegisterClass *
getRegClassForTypeOnBank(unsigned OpSize, const RegisterBank &RB,
const RegisterBankInfo &RBI) const;
+ unsigned selectLoadStoreOpCode(MachineInstr &I,
+ MachineRegisterInfo &MRI) const;
const MipsTargetMachine &TM;
const MipsSubtarget &STI;
@@ -84,7 +87,7 @@ MipsInstructionSelector::MipsInstructionSelector(
bool MipsInstructionSelector::selectCopy(MachineInstr &I,
MachineRegisterInfo &MRI) const {
Register DstReg = I.getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ if (Register::isPhysicalRegister(DstReg))
return true;
const RegisterBank *RegBank = RBI.getRegBank(DstReg, MRI, TRI);
@@ -158,9 +161,15 @@ bool MipsInstructionSelector::materialize32BitImm(Register DestReg, APInt Imm,
}
/// Returning Opc indicates that we failed to select MIPS instruction opcode.
-static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes,
- unsigned RegBank, bool isFP64) {
- bool isStore = Opc == TargetOpcode::G_STORE;
+unsigned
+MipsInstructionSelector::selectLoadStoreOpCode(MachineInstr &I,
+ MachineRegisterInfo &MRI) const {
+ STI.getRegisterInfo();
+ const Register DestReg = I.getOperand(0).getReg();
+ const unsigned RegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID();
+ const unsigned MemSizeInBytes = (*I.memoperands_begin())->getSize();
+ unsigned Opc = I.getOpcode();
+ const bool isStore = Opc == TargetOpcode::G_STORE;
if (RegBank == Mips::GPRBRegBankID) {
if (isStore)
switch (MemSizeInBytes) {
@@ -192,10 +201,24 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes,
case 4:
return isStore ? Mips::SWC1 : Mips::LWC1;
case 8:
- if (isFP64)
+ if (STI.isFP64bit())
return isStore ? Mips::SDC164 : Mips::LDC164;
else
return isStore ? Mips::SDC1 : Mips::LDC1;
+ case 16: {
+ assert(STI.hasMSA() && "Vector instructions require target with MSA.");
+ const unsigned VectorElementSizeInBytes =
+ MRI.getType(DestReg).getElementType().getSizeInBytes();
+ if (VectorElementSizeInBytes == 1)
+ return isStore ? Mips::ST_B : Mips::LD_B;
+ if (VectorElementSizeInBytes == 2)
+ return isStore ? Mips::ST_H : Mips::LD_H;
+ if (VectorElementSizeInBytes == 4)
+ return isStore ? Mips::ST_W : Mips::LD_W;
+ if (VectorElementSizeInBytes == 8)
+ return isStore ? Mips::ST_D : Mips::LD_D;
+ return Opc;
+ }
default:
return Opc;
}
@@ -203,8 +226,7 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned MemSizeInBytes,
return Opc;
}
-bool MipsInstructionSelector::select(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+bool MipsInstructionSelector::select(MachineInstr &I) {
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
@@ -231,7 +253,7 @@ bool MipsInstructionSelector::select(MachineInstr &I,
return true;
}
- if (selectImpl(I, CoverageInfo))
+ if (selectImpl(I, *CoverageInfo))
return true;
MachineInstr *MI = nullptr;
@@ -265,6 +287,11 @@ bool MipsInstructionSelector::select(MachineInstr &I,
.add(I.getOperand(2));
break;
}
+ case G_INTTOPTR:
+ case G_PTRTOINT: {
+ I.setDesc(TII.get(COPY));
+ return selectCopy(I, MRI);
+ }
case G_FRAME_INDEX: {
MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
.add(I.getOperand(0))
@@ -279,12 +306,71 @@ bool MipsInstructionSelector::select(MachineInstr &I,
.add(I.getOperand(1));
break;
}
+ case G_BRJT: {
+ unsigned EntrySize =
+ MF.getJumpTableInfo()->getEntrySize(MF.getDataLayout());
+ assert(isPowerOf2_32(EntrySize) &&
+ "Non-power-of-two jump-table entry size not supported.");
+
+ Register JTIndex = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ MachineInstr *SLL = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::SLL))
+ .addDef(JTIndex)
+ .addUse(I.getOperand(2).getReg())
+ .addImm(Log2_32(EntrySize));
+ if (!constrainSelectedInstRegOperands(*SLL, TII, TRI, RBI))
+ return false;
+
+ Register DestAddress = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ MachineInstr *ADDu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu))
+ .addDef(DestAddress)
+ .addUse(I.getOperand(0).getReg())
+ .addUse(JTIndex);
+ if (!constrainSelectedInstRegOperands(*ADDu, TII, TRI, RBI))
+ return false;
+
+ Register Dest = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ MachineInstr *LW =
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LW))
+ .addDef(Dest)
+ .addUse(DestAddress)
+ .addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_ABS_LO)
+ .addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo(), MachineMemOperand::MOLoad, 4, 4));
+ if (!constrainSelectedInstRegOperands(*LW, TII, TRI, RBI))
+ return false;
+
+ if (MF.getTarget().isPositionIndependent()) {
+ Register DestTmp = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ LW->getOperand(0).setReg(DestTmp);
+ MachineInstr *ADDu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDu))
+ .addDef(Dest)
+ .addUse(DestTmp)
+ .addUse(MF.getInfo<MipsFunctionInfo>()
+ ->getGlobalBaseRegForGlobalISel());
+ if (!constrainSelectedInstRegOperands(*ADDu, TII, TRI, RBI))
+ return false;
+ }
+
+ MachineInstr *Branch =
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoIndirectBranch))
+ .addUse(Dest);
+ if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+ case G_BRINDIRECT: {
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::PseudoIndirectBranch))
+ .add(I.getOperand(0));
+ break;
+ }
case G_PHI: {
const Register DestReg = I.getOperand(0).getReg();
const unsigned OpSize = MRI.getType(DestReg).getSizeInBits();
const TargetRegisterClass *DefRC = nullptr;
- if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ if (Register::isPhysicalRegister(DestReg))
DefRC = TRI.getRegClass(DestReg);
else
DefRC = getRegClassForTypeOnBank(OpSize,
@@ -297,26 +383,35 @@ bool MipsInstructionSelector::select(MachineInstr &I,
case G_LOAD:
case G_ZEXTLOAD:
case G_SEXTLOAD: {
- const Register DestReg = I.getOperand(0).getReg();
- const unsigned DestRegBank = RBI.getRegBank(DestReg, MRI, TRI)->getID();
- const unsigned OpSize = MRI.getType(DestReg).getSizeInBits();
- const unsigned OpMemSizeInBytes = (*I.memoperands_begin())->getSize();
-
- if (DestRegBank == Mips::GPRBRegBankID && OpSize != 32)
- return false;
-
- if (DestRegBank == Mips::FPRBRegBankID && OpSize != 32 && OpSize != 64)
- return false;
-
- const unsigned NewOpc = selectLoadStoreOpCode(
- I.getOpcode(), OpMemSizeInBytes, DestRegBank, STI.isFP64bit());
+ const unsigned NewOpc = selectLoadStoreOpCode(I, MRI);
if (NewOpc == I.getOpcode())
return false;
+ MachineOperand BaseAddr = I.getOperand(1);
+ int64_t SignedOffset = 0;
+ // Try to fold load/store + G_GEP + G_CONSTANT
+ // %SignedOffset:(s32) = G_CONSTANT i32 16_bit_signed_immediate
+ // %Addr:(p0) = G_GEP %BaseAddr, %SignedOffset
+ // %LoadResult/%StoreSrc = load/store %Addr(p0)
+ // into:
+ // %LoadResult/%StoreSrc = NewOpc %BaseAddr(p0), 16_bit_signed_immediate
+
+ MachineInstr *Addr = MRI.getVRegDef(I.getOperand(1).getReg());
+ if (Addr->getOpcode() == G_GEP) {
+ MachineInstr *Offset = MRI.getVRegDef(Addr->getOperand(2).getReg());
+ if (Offset->getOpcode() == G_CONSTANT) {
+ APInt OffsetValue = Offset->getOperand(1).getCImm()->getValue();
+ if (OffsetValue.isSignedIntN(16)) {
+ BaseAddr = Addr->getOperand(1);
+ SignedOffset = OffsetValue.getSExtValue();
+ }
+ }
+ }
+
MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
.add(I.getOperand(0))
- .add(I.getOperand(1))
- .addImm(0)
+ .add(BaseAddr)
+ .addImm(SignedOffset)
.addMemOperand(*I.memoperands_begin());
break;
}
@@ -356,6 +451,18 @@ bool MipsInstructionSelector::select(MachineInstr &I,
.add(I.getOperand(3));
break;
}
+ case G_IMPLICIT_DEF: {
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::IMPLICIT_DEF))
+ .add(I.getOperand(0));
+
+ // Set class based on the register bank; implicit defs can be fpr or gpr.
+ MRI.setRegClass(MI->getOperand(0).getReg(),
+ getRegClassForTypeOnBank(
+ MRI.getType(I.getOperand(0).getReg()).getSizeInBits(),
+ *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI),
+ RBI));
+ break;
+ }
case G_CONSTANT: {
MachineIRBuilder B(I);
if (!materialize32BitImm(I.getOperand(0).getReg(),
@@ -423,7 +530,7 @@ bool MipsInstructionSelector::select(MachineInstr &I,
Opcode = Mips::TRUNC_W_S;
else
Opcode = STI.isFP64bit() ? Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32;
- unsigned ResultInFPR = MRI.createVirtualRegister(&Mips::FGR32RegClass);
+ Register ResultInFPR = MRI.createVirtualRegister(&Mips::FGR32RegClass);
MachineInstr *Trunc = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Opcode))
.addDef(ResultInFPR)
.addUse(I.getOperand(1).getReg());
@@ -496,6 +603,24 @@ bool MipsInstructionSelector::select(MachineInstr &I,
I.eraseFromParent();
return true;
}
+ case G_JUMP_TABLE: {
+ if (MF.getTarget().isPositionIndependent()) {
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LW))
+ .addDef(I.getOperand(0).getReg())
+ .addReg(MF.getInfo<MipsFunctionInfo>()
+ ->getGlobalBaseRegForGlobalISel())
+ .addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_GOT)
+ .addMemOperand(
+ MF.getMachineMemOperand(MachinePointerInfo::getGOT(MF),
+ MachineMemOperand::MOLoad, 4, 4));
+ } else {
+ MI =
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi))
+ .addDef(I.getOperand(0).getReg())
+ .addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_ABS_HI);
+ }
+ break;
+ }
case G_ICMP: {
struct Instr {
unsigned Opcode;
@@ -626,7 +751,7 @@ bool MipsInstructionSelector::select(MachineInstr &I,
// MipsFCMPCondCode, result is inverted i.e. MOVT_I is used.
unsigned MoveOpcode = isLogicallyNegated ? Mips::MOVT_I : Mips::MOVF_I;
- unsigned TrueInReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register TrueInReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu))
.addDef(TrueInReg)
.addUse(Mips::ZERO)
@@ -654,6 +779,33 @@ bool MipsInstructionSelector::select(MachineInstr &I,
I.eraseFromParent();
return true;
}
+ case G_FENCE: {
+ MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::SYNC)).addImm(0);
+ break;
+ }
+ case G_VASTART: {
+ MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+ int FI = FuncInfo->getVarArgsFrameIndex();
+
+ Register LeaReg = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ MachineInstr *LEA_ADDiu =
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LEA_ADDiu))
+ .addDef(LeaReg)
+ .addFrameIndex(FI)
+ .addImm(0);
+ if (!constrainSelectedInstRegOperands(*LEA_ADDiu, TII, TRI, RBI))
+ return false;
+
+ MachineInstr *Store = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::SW))
+ .addUse(LeaReg)
+ .addUse(I.getOperand(0).getReg())
+ .addImm(0);
+ if (!constrainSelectedInstRegOperands(*Store, TII, TRI, RBI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
default:
return false;
}
diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp
index e442a81837ed..bb4a1d902d75 100644
--- a/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -16,18 +16,65 @@
using namespace llvm;
+struct TypesAndMemOps {
+ LLT ValTy;
+ LLT PtrTy;
+ unsigned MemSize;
+ bool MustBeNaturallyAligned;
+};
+
+static bool
+CheckTy0Ty1MemSizeAlign(const LegalityQuery &Query,
+ std::initializer_list<TypesAndMemOps> SupportedValues) {
+ for (auto &Val : SupportedValues) {
+ if (Val.ValTy != Query.Types[0])
+ continue;
+ if (Val.PtrTy != Query.Types[1])
+ continue;
+ if (Val.MemSize != Query.MMODescrs[0].SizeInBits)
+ continue;
+ if (Val.MustBeNaturallyAligned &&
+ Query.MMODescrs[0].SizeInBits % Query.MMODescrs[0].AlignInBits != 0)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static bool CheckTyN(unsigned N, const LegalityQuery &Query,
+ std::initializer_list<LLT> SupportedValues) {
+ for (auto &Val : SupportedValues)
+ if (Val == Query.Types[N])
+ return true;
+ return false;
+}
+
MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
using namespace TargetOpcode;
const LLT s1 = LLT::scalar(1);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
const LLT p0 = LLT::pointer(0, 32);
- getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ getActionDefinitionsBuilder({G_SUB, G_MUL})
.legalFor({s32})
.clampScalar(0, s32, s32);
+ getActionDefinitionsBuilder(G_ADD)
+ .legalIf([=, &ST](const LegalityQuery &Query) {
+ if (CheckTyN(0, Query, {s32}))
+ return true;
+ if (ST.hasMSA() && CheckTyN(0, Query, {v16s8, v8s16, v4s32, v2s64}))
+ return true;
+ return false;
+ })
+ .clampScalar(0, s32, s32);
+
getActionDefinitionsBuilder({G_UADDO, G_UADDE, G_USUBO, G_USUBE, G_UMULO})
.lowerFor({{s32, s1}});
@@ -36,13 +83,26 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
.maxScalar(0, s32);
getActionDefinitionsBuilder({G_LOAD, G_STORE})
- .legalForTypesWithMemDesc({{s32, p0, 8, 8},
- {s32, p0, 16, 8},
- {s32, p0, 32, 8},
- {s64, p0, 64, 8},
- {p0, p0, 32, 8}})
+ .legalIf([=, &ST](const LegalityQuery &Query) {
+ if (CheckTy0Ty1MemSizeAlign(Query, {{s32, p0, 8, ST.hasMips32r6()},
+ {s32, p0, 16, ST.hasMips32r6()},
+ {s32, p0, 32, ST.hasMips32r6()},
+ {p0, p0, 32, ST.hasMips32r6()},
+ {s64, p0, 64, ST.hasMips32r6()}}))
+ return true;
+ if (ST.hasMSA() &&
+ CheckTy0Ty1MemSizeAlign(Query, {{v16s8, p0, 128, false},
+ {v8s16, p0, 128, false},
+ {v4s32, p0, 128, false},
+ {v2s64, p0, 128, false}}))
+ return true;
+ return false;
+ })
.minScalar(0, s32);
+ getActionDefinitionsBuilder(G_IMPLICIT_DEF)
+ .legalFor({s32, s64});
+
getActionDefinitionsBuilder(G_UNMERGE_VALUES)
.legalFor({{s32, s64}});
@@ -50,9 +110,17 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
.legalFor({{s64, s32}});
getActionDefinitionsBuilder({G_ZEXTLOAD, G_SEXTLOAD})
- .legalForTypesWithMemDesc({{s32, p0, 8, 8},
- {s32, p0, 16, 8}})
- .minScalar(0, s32);
+ .legalForTypesWithMemDesc({{s32, p0, 8, 8},
+ {s32, p0, 16, 8}})
+ .clampScalar(0, s32, s32);
+
+ getActionDefinitionsBuilder({G_ZEXT, G_SEXT})
+ .legalIf([](const LegalityQuery &Query) { return false; })
+ .maxScalar(0, s32);
+
+ getActionDefinitionsBuilder(G_TRUNC)
+ .legalIf([](const LegalityQuery &Query) { return false; })
+ .maxScalar(1, s32);
getActionDefinitionsBuilder(G_SELECT)
.legalForCartesianProduct({p0, s32, s64}, {s32})
@@ -63,6 +131,12 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
.legalFor({s32})
.minScalar(0, s32);
+ getActionDefinitionsBuilder(G_BRJT)
+ .legalFor({{p0, s32}});
+
+ getActionDefinitionsBuilder(G_BRINDIRECT)
+ .legalFor({p0});
+
getActionDefinitionsBuilder(G_PHI)
.legalFor({p0, s32, s64})
.minScalar(0, s32);
@@ -77,8 +151,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
.libcallFor({s64});
getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
- .legalFor({s32, s32})
- .minScalar(1, s32);
+ .legalFor({{s32, s32}})
+ .clampScalar(1, s32, s32)
+ .clampScalar(0, s32, s32);
getActionDefinitionsBuilder(G_ICMP)
.legalForCartesianProduct({s32}, {s32, p0})
@@ -89,15 +164,24 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
.legalFor({s32})
.clampScalar(0, s32, s32);
- getActionDefinitionsBuilder(G_GEP)
+ getActionDefinitionsBuilder({G_GEP, G_INTTOPTR})
.legalFor({{p0, s32}});
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalFor({{s32, p0}});
+
getActionDefinitionsBuilder(G_FRAME_INDEX)
.legalFor({p0});
- getActionDefinitionsBuilder(G_GLOBAL_VALUE)
+ getActionDefinitionsBuilder({G_GLOBAL_VALUE, G_JUMP_TABLE})
.legalFor({p0});
+ getActionDefinitionsBuilder(G_DYN_STACKALLOC)
+ .lowerFor({{p0, s32}});
+
+ getActionDefinitionsBuilder(G_VASTART)
+ .legalFor({p0});
+
// FP instructions
getActionDefinitionsBuilder(G_FCONSTANT)
.legalFor({s32, s64});
@@ -126,6 +210,7 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
getActionDefinitionsBuilder(G_FPTOUI)
.libcallForCartesianProduct({s64}, {s64, s32})
+ .lowerForCartesianProduct({s32}, {s64, s32})
.minScalar(0, s32);
// Int to FP conversion instructions
@@ -136,8 +221,11 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
getActionDefinitionsBuilder(G_UITOFP)
.libcallForCartesianProduct({s64, s32}, {s64})
+ .customForCartesianProduct({s64, s32}, {s32})
.minScalar(1, s32);
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+
computeTables();
verify(*ST.getInstrInfo());
}
@@ -150,6 +238,134 @@ bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI,
using namespace TargetOpcode;
MIRBuilder.setInstr(MI);
+ const MipsSubtarget &STI =
+ static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
- return false;
+ switch (MI.getOpcode()) {
+ case G_UITOFP: {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+
+ if (SrcTy != s32)
+ return false;
+ if (DstTy != s32 && DstTy != s64)
+ return false;
+
+ // Let 0xABCDEFGH be the unsigned value in MI.getOperand(1). First convert it
+ // to double. The mantissa has 52 bits, so we use the following trick:
+ // build the floating-point bit pattern 0x43300000ABCDEFGH.
+ // It represents 2^52 * 0x1.00000ABCDEFGH, i.e. 0x100000ABCDEFGH.0 .
+ // Next, subtract 2^52 * 0x1.0000000000000, i.e. 0x10000000000000.0, from it.
+ // Finally, truncate the double to float if needed.
+
+ MachineInstrBuilder Bitcast = MIRBuilder.buildInstr(
+ STI.isFP64bit() ? Mips::BuildPairF64_64 : Mips::BuildPairF64, {s64},
+ {Src, MIRBuilder.buildConstant(s32, UINT32_C(0x43300000))});
+ Bitcast.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
+
+ MachineInstrBuilder TwoP52FP = MIRBuilder.buildFConstant(
+ s64, BitsToDouble(UINT64_C(0x4330000000000000)));
+
+ if (DstTy == s64)
+ MIRBuilder.buildFSub(Dst, Bitcast, TwoP52FP);
+ else {
+ MachineInstrBuilder ResF64 = MIRBuilder.buildFSub(s64, Bitcast, TwoP52FP);
+ MIRBuilder.buildFPTrunc(Dst, ResF64);
+ }
+
+ MI.eraseFromParent();
+ break;
+ }
+ default:
+ return false;
+ }
+
+ return true;
+}
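The comment in the G_UITOFP case describes a classic trick for converting a 32-bit unsigned integer to double without an unsigned-convert instruction. A host-side demonstration of the same arithmetic (a sketch with illustrative names, not the patch's code; it assumes IEEE-754 doubles):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Placing a 32-bit unsigned value U in the low half of the bit pattern
    // 0x43300000_00000000 yields the double 2^52 + U exactly (the mantissa has
    // 52 bits), so subtracting 2^52 recovers (double)U.
    static double uintToDoubleViaBits(uint32_t U) {
      uint64_t Bits = (uint64_t{0x43300000} << 32) | U;
      double D;
      std::memcpy(&D, &Bits, sizeof D); // bit-cast the pattern to a double
      return D - 0x1p52;                // subtract 2^52 * 1.0
    }

    int main() {
      const uint32_t Tests[] = {0, 1, 0x80000000u, 0xFFFFFFFFu};
      for (uint32_t U : Tests)
        std::printf("%u -> %.1f\n", U, uintToDoubleViaBits(U));
      return 0;
    }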
+
+static bool SelectMSA3OpIntrinsic(MachineInstr &MI, unsigned Opcode,
+ MachineIRBuilder &MIRBuilder,
+ const MipsSubtarget &ST) {
+ assert(ST.hasMSA() && "MSA intrinsic not supported on target without MSA.");
+ if (!MIRBuilder.buildInstr(Opcode)
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(),
+ *ST.getRegBankInfo()))
+ return false;
+ MI.eraseFromParent();
+ return true;
+}
+
+static bool MSA3OpIntrinsicToGeneric(MachineInstr &MI, unsigned Opcode,
+ MachineIRBuilder &MIRBuilder,
+ const MipsSubtarget &ST) {
+ assert(ST.hasMSA() && "MSA intrinsic not supported on target without MSA.");
+ MIRBuilder.buildInstr(Opcode)
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ MI.eraseFromParent();
+ return true;
+}
+
+bool MipsLegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ const MipsSubtarget &ST =
+ static_cast<const MipsSubtarget &>(MI.getMF()->getSubtarget());
+ const MipsInstrInfo &TII = *ST.getInstrInfo();
+ const MipsRegisterInfo &TRI = *ST.getRegisterInfo();
+ const RegisterBankInfo &RBI = *ST.getRegBankInfo();
+ MIRBuilder.setInstr(MI);
+
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ if (createMemLibcall(MIRBuilder, MRI, MI) ==
+ LegalizerHelper::UnableToLegalize)
+ return false;
+ MI.eraseFromParent();
+ return true;
+ case Intrinsic::trap: {
+ MachineInstr *Trap = MIRBuilder.buildInstr(Mips::TRAP);
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Trap, TII, TRI, RBI);
+ }
+ case Intrinsic::vacopy: {
+ Register Tmp = MRI.createGenericVirtualRegister(LLT::pointer(0, 32));
+ MachinePointerInfo MPO;
+ MIRBuilder.buildLoad(Tmp, MI.getOperand(2),
+ *MI.getMF()->getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad, 4, 4));
+ MIRBuilder.buildStore(Tmp, MI.getOperand(1),
+ *MI.getMF()->getMachineMemOperand(
+ MPO, MachineMemOperand::MOStore, 4, 4));
+ MI.eraseFromParent();
+ return true;
+ }
+ case Intrinsic::mips_addv_b:
+ case Intrinsic::mips_addv_h:
+ case Intrinsic::mips_addv_w:
+ case Intrinsic::mips_addv_d:
+ return MSA3OpIntrinsicToGeneric(MI, TargetOpcode::G_ADD, MIRBuilder, ST);
+ case Intrinsic::mips_addvi_b:
+ return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_B, MIRBuilder, ST);
+ case Intrinsic::mips_addvi_h:
+ return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_H, MIRBuilder, ST);
+ case Intrinsic::mips_addvi_w:
+ return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_W, MIRBuilder, ST);
+ case Intrinsic::mips_addvi_d:
+ return SelectMSA3OpIntrinsic(MI, Mips::ADDVI_D, MIRBuilder, ST);
+ default:
+ break;
+ }
+ return true;
}
diff --git a/lib/Target/Mips/MipsLegalizerInfo.h b/lib/Target/Mips/MipsLegalizerInfo.h
index e5021e081890..9696c262b2db 100644
--- a/lib/Target/Mips/MipsLegalizerInfo.h
+++ b/lib/Target/Mips/MipsLegalizerInfo.h
@@ -28,6 +28,9 @@ public:
bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const override;
+
+ bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
};
} // end namespace llvm
#endif
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index 907ed9ef746f..f585d9c1a148 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -60,6 +60,11 @@ def immZExt2Ptr : ImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>;
def immZExt3Ptr : ImmLeaf<iPTR, [{return isUInt<3>(Imm);}]>;
def immZExt4Ptr : ImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>;
+def timmZExt1Ptr : TImmLeaf<iPTR, [{return isUInt<1>(Imm);}]>;
+def timmZExt2Ptr : TImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>;
+def timmZExt3Ptr : TImmLeaf<iPTR, [{return isUInt<3>(Imm);}]>;
+def timmZExt4Ptr : TImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>;
+
// Operands
def immZExt2Lsa : ImmLeaf<i32, [{return isUInt<2>(Imm - 1);}]>;
@@ -1270,7 +1275,7 @@ class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
dag OutOperandList = (outs ROWD:$wd);
dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
- list<dag> Pattern = [(set ROWD:$wd, (MipsSHF immZExt8:$u8, ROWS:$ws))];
+ list<dag> Pattern = [(set ROWD:$wd, (MipsSHF timmZExt8:$u8, ROWS:$ws))];
InstrItinClass Itinerary = itin;
}
@@ -2299,13 +2304,13 @@ class INSERT_FW_VIDX64_PSEUDO_DESC :
class INSERT_FD_VIDX64_PSEUDO_DESC :
MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR64Opnd>;
-class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4,
+class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, timmZExt4,
MSA128BOpnd>;
-class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3,
+class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, timmZExt3,
MSA128HOpnd>;
-class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2,
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, timmZExt2,
MSA128WOpnd>;
-class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1,
+class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, timmZExt1,
MSA128DOpnd>;
class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -2518,22 +2523,22 @@ class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>;
class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>;
class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3,
- immZExt3, MSA128BOpnd>;
+ timmZExt3, MSA128BOpnd>;
class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4,
- immZExt4, MSA128HOpnd>;
+ timmZExt4, MSA128HOpnd>;
class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5,
- immZExt5, MSA128WOpnd>;
+ timmZExt5, MSA128WOpnd>;
class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6,
- immZExt6, MSA128DOpnd>;
+ timmZExt6, MSA128DOpnd>;
class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3,
- immZExt3, MSA128BOpnd>;
+ timmZExt3, MSA128BOpnd>;
class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4,
- immZExt4, MSA128HOpnd>;
+ timmZExt4, MSA128HOpnd>;
class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5,
- immZExt5, MSA128WOpnd>;
+ timmZExt5, MSA128WOpnd>;
class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6,
- immZExt6, MSA128DOpnd>;
+ timmZExt6, MSA128DOpnd>;
class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>;
class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>;
@@ -2546,16 +2551,16 @@ class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>;
class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b,
MSA128BOpnd, MSA128BOpnd, uimm4,
- immZExt4>;
+ timmZExt4>;
class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h,
MSA128HOpnd, MSA128HOpnd, uimm3,
- immZExt3>;
+ timmZExt3>;
class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w,
MSA128WOpnd, MSA128WOpnd, uimm2,
- immZExt2>;
+ timmZExt2>;
class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d,
MSA128DOpnd, MSA128DOpnd, uimm1,
- immZExt1>;
+ timmZExt1>;
class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>;
class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>;
@@ -2609,13 +2614,13 @@ class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>;
class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>;
class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3,
- immZExt3, MSA128BOpnd>;
+ timmZExt3, MSA128BOpnd>;
class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4,
- immZExt4, MSA128HOpnd>;
+ timmZExt4, MSA128HOpnd>;
class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5,
- immZExt5, MSA128WOpnd>;
+ timmZExt5, MSA128WOpnd>;
class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6,
- immZExt6, MSA128DOpnd>;
+ timmZExt6, MSA128DOpnd>;
class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>;
class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>;
@@ -2637,13 +2642,13 @@ class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>;
class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>;
class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3,
- immZExt3, MSA128BOpnd>;
+ timmZExt3, MSA128BOpnd>;
class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4,
- immZExt4, MSA128HOpnd>;
+ timmZExt4, MSA128HOpnd>;
class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5,
- immZExt5, MSA128WOpnd>;
+ timmZExt5, MSA128WOpnd>;
class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6,
- immZExt6, MSA128DOpnd>;
+ timmZExt6, MSA128DOpnd>;
class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
ValueType TyNode, RegisterOperand ROWD,
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index 5ef07a2d283e..8bd64ff6cb27 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -127,8 +127,7 @@ static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) {
MachineOperand &MO = MI.getOperand(0);
- if (!MO.isReg() || !MO.isUse() ||
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!MO.isReg() || !MO.isUse() || !Register::isVirtualRegister(MO.getReg()))
return nullptr;
return &MO;
@@ -152,7 +151,7 @@ static void setCallTargetReg(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I) {
MachineFunction &MF = *MBB->getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- unsigned SrcReg = I->getOperand(0).getReg();
+ Register SrcReg = I->getOperand(0).getReg();
unsigned DstReg = getRegTy(SrcReg, MF) == MVT::i32 ? Mips::T9 : Mips::T9_64;
BuildMI(*MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), DstReg)
.addReg(SrcReg);
diff --git a/lib/Target/Mips/MipsPfmCounters.td b/lib/Target/Mips/MipsPfmCounters.td
new file mode 100644
index 000000000000..c7779b474b91
--- /dev/null
+++ b/lib/Target/Mips/MipsPfmCounters.td
@@ -0,0 +1,18 @@
+//===-- MipsPfmCounters.td - Mips Hardware Counters --------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the available hardware counters for Mips.
+//
+//===----------------------------------------------------------------------===//
+
+def CpuCyclesPfmCounter : PfmCounter<"CYCLES">;
+
+def DefaultPfmCounters : ProcPfmCounters {
+ let CycleCounter = CpuCyclesPfmCounter;
+}
+def : PfmCountersDefaultBinding<DefaultPfmCounters>;
diff --git a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
index 85076590d407..ace0735652bd 100644
--- a/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
+++ b/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
@@ -27,7 +27,8 @@ class MipsPreLegalizerCombinerInfo : public CombinerInfo {
public:
MipsPreLegalizerCombinerInfo()
: CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
- /*LegalizerInfo*/ nullptr) {}
+ /*LegalizerInfo*/ nullptr, /*EnableOpt*/ false,
+ /*EnableOptSize*/ false, /*EnableMinSize*/ false) {}
virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
MachineIRBuilder &B) const override;
};
diff --git a/lib/Target/Mips/MipsRegisterBankInfo.cpp b/lib/Target/Mips/MipsRegisterBankInfo.cpp
index d8bcf16afd50..d334366e727c 100644
--- a/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -12,6 +12,7 @@
#include "MipsRegisterBankInfo.h"
#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
@@ -27,20 +28,23 @@ enum PartialMappingIdx {
PMI_GPR,
PMI_SPR,
PMI_DPR,
+ PMI_MSA,
PMI_Min = PMI_GPR,
};
RegisterBankInfo::PartialMapping PartMappings[]{
{0, 32, GPRBRegBank},
{0, 32, FPRBRegBank},
- {0, 64, FPRBRegBank}
+ {0, 64, FPRBRegBank},
+ {0, 128, FPRBRegBank}
};
enum ValueMappingIdx {
InvalidIdx = 0,
GPRIdx = 1,
SPRIdx = 4,
- DPRIdx = 7
+ DPRIdx = 7,
+ MSAIdx = 10
};
RegisterBankInfo::ValueMapping ValueMappings[] = {
@@ -50,14 +54,18 @@ RegisterBankInfo::ValueMapping ValueMappings[] = {
{&PartMappings[PMI_GPR - PMI_Min], 1},
{&PartMappings[PMI_GPR - PMI_Min], 1},
{&PartMappings[PMI_GPR - PMI_Min], 1},
- // up to 3 ops operands FPRs - single precission
+ // up to 3 operands in FPRs - single precision
{&PartMappings[PMI_SPR - PMI_Min], 1},
{&PartMappings[PMI_SPR - PMI_Min], 1},
{&PartMappings[PMI_SPR - PMI_Min], 1},
- // up to 3 ops operands FPRs - double precission
+ // up to 3 operands in FPRs - double precision
{&PartMappings[PMI_DPR - PMI_Min], 1},
{&PartMappings[PMI_DPR - PMI_Min], 1},
- {&PartMappings[PMI_DPR - PMI_Min], 1}
+ {&PartMappings[PMI_DPR - PMI_Min], 1},
+ // up to 3 operands in FPRs - MSA
+ {&PartMappings[PMI_MSA - PMI_Min], 1},
+ {&PartMappings[PMI_MSA - PMI_Min], 1},
+ {&PartMappings[PMI_MSA - PMI_Min], 1}
};
} // end namespace Mips
@@ -86,6 +94,10 @@ const RegisterBank &MipsRegisterBankInfo::getRegBankFromRegClass(
case Mips::FGR32RegClassID:
case Mips::FGR64RegClassID:
case Mips::AFGR64RegClassID:
+ case Mips::MSA128BRegClassID:
+ case Mips::MSA128HRegClassID:
+ case Mips::MSA128WRegClassID:
+ case Mips::MSA128DRegClassID:
return getRegBank(Mips::FPRBRegBankID);
default:
llvm_unreachable("Register class not supported");
@@ -149,6 +161,7 @@ static bool isAmbiguous(unsigned Opc) {
case TargetOpcode::G_STORE:
case TargetOpcode::G_PHI:
case TargetOpcode::G_SELECT:
+ case TargetOpcode::G_IMPLICIT_DEF:
return true;
default:
return false;
@@ -163,8 +176,7 @@ void MipsRegisterBankInfo::AmbiguousRegDefUseContainer::addDefUses(
MachineInstr *NonCopyInstr = skipCopiesOutgoing(&UseMI);
// Copy with many uses.
if (NonCopyInstr->getOpcode() == TargetOpcode::COPY &&
- !TargetRegisterInfo::isPhysicalRegister(
- NonCopyInstr->getOperand(0).getReg()))
+ !Register::isPhysicalRegister(NonCopyInstr->getOperand(0).getReg()))
addDefUses(NonCopyInstr->getOperand(0).getReg(), MRI);
else
DefUses.push_back(skipCopiesOutgoing(&UseMI));
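
This and the following hunks are a mechanical move from TargetRegisterInfo's static register predicates to the Register wrapper; a short sketch of the equivalence relied on throughout the diff (illustrative):

// Sketch: Register carries the old unsigned register id and converts
// implicitly to and from it, so call sites only change where they name the
// virtual/physical predicates.
Register R = NonCopyInstr->getOperand(0).getReg(); // getReg() now returns Register
bool IsPhys = Register::isPhysicalRegister(R);     // was TargetRegisterInfo::isPhysicalRegister(R)
unsigned Legacy = R;                               // implicit conversion keeps older code compiling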
@@ -186,7 +198,7 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesOutgoing(
const MachineRegisterInfo &MRI = MF.getRegInfo();
MachineInstr *Ret = MI;
while (Ret->getOpcode() == TargetOpcode::COPY &&
- !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(0).getReg()) &&
+ !Register::isPhysicalRegister(Ret->getOperand(0).getReg()) &&
MRI.hasOneUse(Ret->getOperand(0).getReg())) {
Ret = &(*MRI.use_instr_begin(Ret->getOperand(0).getReg()));
}
@@ -200,7 +212,7 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::skipCopiesIncoming(
const MachineRegisterInfo &MRI = MF.getRegInfo();
MachineInstr *Ret = MI;
while (Ret->getOpcode() == TargetOpcode::COPY &&
- !TargetRegisterInfo::isPhysicalRegister(Ret->getOperand(1).getReg()))
+ !Register::isPhysicalRegister(Ret->getOperand(1).getReg()))
Ret = MRI.getVRegDef(Ret->getOperand(1).getReg());
return Ret;
}
@@ -231,6 +243,9 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::AmbiguousRegDefUseContainer(
addUseDef(MI->getOperand(2).getReg(), MRI);
addUseDef(MI->getOperand(3).getReg(), MRI);
}
+
+ if (MI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+ addDefUses(MI->getOperand(0).getReg(), MRI);
}
bool MipsRegisterBankInfo::TypeInfoForMF::visit(
@@ -318,8 +333,7 @@ void MipsRegisterBankInfo::TypeInfoForMF::setTypes(const MachineInstr *MI,
void MipsRegisterBankInfo::TypeInfoForMF::setTypesAccordingToPhysicalRegister(
const MachineInstr *MI, const MachineInstr *CopyInst, unsigned Op) {
- assert((TargetRegisterInfo::isPhysicalRegister(
- CopyInst->getOperand(Op).getReg())) &&
+ assert((Register::isPhysicalRegister(CopyInst->getOperand(Op).getReg())) &&
"Copies of non physical registers should not be considered here.\n");
const MachineFunction &MF = *CopyInst->getMF();
@@ -353,6 +367,31 @@ void MipsRegisterBankInfo::TypeInfoForMF::cleanupIfNewFunction(
}
}
+static const MipsRegisterBankInfo::ValueMapping *
+getMSAMapping(const MachineFunction &MF) {
+ assert(static_cast<const MipsSubtarget &>(MF.getSubtarget()).hasMSA() &&
+ "MSA mapping not available on target without MSA.");
+ return &Mips::ValueMappings[Mips::MSAIdx];
+}
+
+static const MipsRegisterBankInfo::ValueMapping *getFprbMapping(unsigned Size) {
+ return Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
+ : &Mips::ValueMappings[Mips::DPRIdx];
+}
+
+static const unsigned CustomMappingID = 1;
+
+// Only the 64 bit mapping is available in fprb; it will be marked as custom,
+// i.e. the value will be split into two 32 bit registers in gprb.
+static const MipsRegisterBankInfo::ValueMapping *
+getGprbOrCustomMapping(unsigned Size, unsigned &MappingID) {
+ if (Size == 32)
+ return &Mips::ValueMappings[Mips::GPRIdx];
+
+ MappingID = CustomMappingID;
+ return &Mips::ValueMappings[Mips::DPRIdx];
+}
+
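
Taken together, the helpers above encode the bank-selection policy of this pass. A usage sketch for a 64-bit scalar that RegBankSelect wants in gprb (illustrative only; assumes the surrounding getInstrMapping context):

// Sketch: for Size == 64 the helper returns the fprb 64-bit ValueMapping but
// switches the mapping ID to CustomMappingID, which is what later routes the
// instruction into applyMappingImpl() to be narrowed to two s32 gprb values.
unsigned MappingID = DefaultMappingID; // RegisterBankInfo::DefaultMappingID
const RegisterBankInfo::ValueMapping *M = getGprbOrCustomMapping(64, MappingID);
assert(M == &Mips::ValueMappings[Mips::DPRIdx] && MappingID == CustomMappingID);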
const RegisterBankInfo::InstructionMapping &
MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
@@ -377,17 +416,35 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned NumOperands = MI.getNumOperands();
const ValueMapping *OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
unsigned MappingID = DefaultMappingID;
- const unsigned CustomMappingID = 1;
+
+ // Check if LLT sizes match sizes of available register banks.
+ for (const MachineOperand &Op : MI.operands()) {
+ if (Op.isReg()) {
+ LLT RegTy = MRI.getType(Op.getReg());
+
+ if (RegTy.isScalar() &&
+ (RegTy.getSizeInBits() != 32 && RegTy.getSizeInBits() != 64))
+ return getInvalidInstructionMapping();
+
+ if (RegTy.isVector() && RegTy.getSizeInBits() != 128)
+ return getInvalidInstructionMapping();
+ }
+ }
+
+ const LLT Op0Ty = MRI.getType(MI.getOperand(0).getReg());
+ unsigned Op0Size = Op0Ty.getSizeInBits();
+ InstType InstTy = InstType::Integer;
switch (Opc) {
case G_TRUNC:
- case G_ADD:
case G_SUB:
case G_MUL:
case G_UMULH:
case G_ZEXTLOAD:
case G_SEXTLOAD:
case G_GEP:
+ case G_INTTOPTR:
+ case G_PTRTOINT:
case G_AND:
case G_OR:
case G_XOR:
@@ -398,66 +455,42 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_UDIV:
case G_SREM:
case G_UREM:
+ case G_BRINDIRECT:
+ case G_VASTART:
OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
break;
- case G_LOAD: {
- unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- InstType InstTy = InstType::Integer;
- if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) {
- InstTy = TI.determineInstType(&MI);
- }
-
- if (InstTy == InstType::FloatingPoint ||
- (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb
- OperandsMapping =
- getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx],
- &Mips::ValueMappings[Mips::GPRIdx]});
+ case G_ADD:
+ OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
+ if (Op0Size == 128)
+ OperandsMapping = getMSAMapping(MF);
+ break;
+ case G_STORE:
+ case G_LOAD:
+ if (Op0Size == 128) {
+ OperandsMapping = getOperandsMapping(
+ {getMSAMapping(MF), &Mips::ValueMappings[Mips::GPRIdx]});
break;
- } else { // gprb
- OperandsMapping =
- getOperandsMapping({Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx],
- &Mips::ValueMappings[Mips::GPRIdx]});
- if (Size == 64)
- MappingID = CustomMappingID;
}
- break;
- }
- case G_STORE: {
- unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- InstType InstTy = InstType::Integer;
- if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) {
+ if (!Op0Ty.isPointer())
InstTy = TI.determineInstType(&MI);
- }
if (InstTy == InstType::FloatingPoint ||
- (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb
- OperandsMapping =
- getOperandsMapping({Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx],
- &Mips::ValueMappings[Mips::GPRIdx]});
- break;
- } else { // gprb
+ (Op0Size == 64 && InstTy == InstType::Ambiguous))
+ OperandsMapping = getOperandsMapping(
+ {getFprbMapping(Op0Size), &Mips::ValueMappings[Mips::GPRIdx]});
+ else
OperandsMapping =
- getOperandsMapping({Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx],
+ getOperandsMapping({getGprbOrCustomMapping(Op0Size, MappingID),
&Mips::ValueMappings[Mips::GPRIdx]});
- if (Size == 64)
- MappingID = CustomMappingID;
- }
+
break;
- }
- case G_PHI: {
- unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- InstType InstTy = InstType::Integer;
- if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) {
+ case G_PHI:
+ if (!Op0Ty.isPointer())
InstTy = TI.determineInstType(&MI);
- }
// PHI is copylike and should have one regbank in mapping for def register.
- if (InstTy == InstType::Integer && Size == 64) { // fprb
+ if (InstTy == InstType::Integer && Op0Size == 64) {
OperandsMapping =
getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx]});
return getInstructionMapping(CustomMappingID, /*Cost=*/1, OperandsMapping,
@@ -465,80 +498,63 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
// Use default handling for PHI, i.e. set reg bank of def operand to match
// register banks of use operands.
- const RegisterBankInfo::InstructionMapping &Mapping =
- getInstrMappingImpl(MI);
- return Mapping;
- }
+ return getInstrMappingImpl(MI);
case G_SELECT: {
- unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- InstType InstTy = InstType::Integer;
- if (!MRI.getType(MI.getOperand(0).getReg()).isPointer()) {
+ if (!Op0Ty.isPointer())
InstTy = TI.determineInstType(&MI);
- }
if (InstTy == InstType::FloatingPoint ||
- (Size == 64 && InstTy == InstType::Ambiguous)) { // fprb
- const RegisterBankInfo::ValueMapping *Bank =
- Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx];
+ (Op0Size == 64 && InstTy == InstType::Ambiguous)) {
+ const RegisterBankInfo::ValueMapping *Bank = getFprbMapping(Op0Size);
OperandsMapping = getOperandsMapping(
{Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank});
break;
- } else { // gprb
+ } else {
const RegisterBankInfo::ValueMapping *Bank =
- Size <= 32 ? &Mips::ValueMappings[Mips::GPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx];
+ getGprbOrCustomMapping(Op0Size, MappingID);
OperandsMapping = getOperandsMapping(
{Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank});
- if (Size == 64)
- MappingID = CustomMappingID;
}
break;
}
- case G_UNMERGE_VALUES: {
+ case G_IMPLICIT_DEF:
+ if (!Op0Ty.isPointer())
+ InstTy = TI.determineInstType(&MI);
+
+ if (InstTy == InstType::FloatingPoint)
+ OperandsMapping = getFprbMapping(Op0Size);
+ else
+ OperandsMapping = getGprbOrCustomMapping(Op0Size, MappingID);
+
+ break;
+ case G_UNMERGE_VALUES:
OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx],
&Mips::ValueMappings[Mips::GPRIdx],
&Mips::ValueMappings[Mips::DPRIdx]});
MappingID = CustomMappingID;
break;
- }
- case G_MERGE_VALUES: {
+ case G_MERGE_VALUES:
OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx],
&Mips::ValueMappings[Mips::GPRIdx],
&Mips::ValueMappings[Mips::GPRIdx]});
MappingID = CustomMappingID;
break;
- }
case G_FADD:
case G_FSUB:
case G_FMUL:
case G_FDIV:
case G_FABS:
- case G_FSQRT:{
- unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- assert((Size == 32 || Size == 64) && "Unsupported floating point size");
- OperandsMapping = Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx];
+ case G_FSQRT:
+ OperandsMapping = getFprbMapping(Op0Size);
break;
- }
- case G_FCONSTANT: {
- unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- assert((Size == 32 || Size == 64) && "Unsupported floating point size");
- const RegisterBankInfo::ValueMapping *FPRValueMapping =
- Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx];
- OperandsMapping = getOperandsMapping({FPRValueMapping, nullptr});
+ case G_FCONSTANT:
+ OperandsMapping = getOperandsMapping({getFprbMapping(Op0Size), nullptr});
break;
- }
case G_FCMP: {
- unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
- assert((Size == 32 || Size == 64) && "Unsupported floating point size");
- const RegisterBankInfo::ValueMapping *FPRValueMapping =
- Size == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx];
+ unsigned Op2Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
OperandsMapping =
getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr,
- FPRValueMapping, FPRValueMapping});
+ getFprbMapping(Op2Size), getFprbMapping(Op2Size)});
break;
}
case G_FPEXT:
@@ -550,36 +566,31 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
&Mips::ValueMappings[Mips::DPRIdx]});
break;
case G_FPTOSI: {
+ assert((Op0Size == 32) && "Unsupported integer size");
unsigned SizeFP = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
- assert((MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 32) &&
- "Unsupported integer size");
- assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size");
- OperandsMapping = getOperandsMapping({
- &Mips::ValueMappings[Mips::GPRIdx],
- SizeFP == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx],
- });
+ OperandsMapping = getOperandsMapping(
+ {&Mips::ValueMappings[Mips::GPRIdx], getFprbMapping(SizeFP)});
break;
}
- case G_SITOFP: {
- unsigned SizeInt = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
- unsigned SizeFP = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- (void)SizeInt;
- assert((SizeInt == 32) && "Unsupported integer size");
- assert((SizeFP == 32 || SizeFP == 64) && "Unsupported floating point size");
- OperandsMapping =
- getOperandsMapping({SizeFP == 32 ? &Mips::ValueMappings[Mips::SPRIdx]
- : &Mips::ValueMappings[Mips::DPRIdx],
- &Mips::ValueMappings[Mips::GPRIdx]});
+ case G_SITOFP:
+ assert((MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() == 32) &&
+ "Unsupported integer size");
+ OperandsMapping = getOperandsMapping(
+ {getFprbMapping(Op0Size), &Mips::ValueMappings[Mips::GPRIdx]});
break;
- }
case G_CONSTANT:
case G_FRAME_INDEX:
case G_GLOBAL_VALUE:
+ case G_JUMP_TABLE:
case G_BRCOND:
OperandsMapping =
getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr});
break;
+ case G_BRJT:
+ OperandsMapping =
+ getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr,
+ &Mips::ValueMappings[Mips::GPRIdx]});
+ break;
case G_ICMP:
OperandsMapping =
getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr,
@@ -609,11 +620,41 @@ public:
};
} // end anonymous namespace
-/// Here we have to narrowScalar s64 operands to s32, combine away
-/// G_MERGE/G_UNMERGE and erase instructions that became dead in the process.
-/// We manually assign 32 bit gprb to register operands of all new instructions
-/// that got created in the process since they will not end up in RegBankSelect
-/// loop. Careful not to delete instruction after MI i.e. MI.getIterator()++.
+void MipsRegisterBankInfo::setRegBank(MachineInstr &MI,
+ MachineRegisterInfo &MRI) const {
+ Register Dest = MI.getOperand(0).getReg();
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_STORE:
+ // No def operands, skip this instruction.
+ break;
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_SELECT:
+ case TargetOpcode::G_PHI:
+ case TargetOpcode::G_IMPLICIT_DEF: {
+ assert(MRI.getType(Dest) == LLT::scalar(32) && "Unexpected operand type.");
+ MRI.setRegBank(Dest, getRegBank(Mips::GPRBRegBankID));
+ break;
+ }
+ case TargetOpcode::G_GEP: {
+ assert(MRI.getType(Dest).isPointer() && "Unexpected operand type.");
+ MRI.setRegBank(Dest, getRegBank(Mips::GPRBRegBankID));
+ break;
+ }
+ default:
+ llvm_unreachable("Unexpected opcode.");
+ }
+}
+
+static void
+combineAwayG_UNMERGE_VALUES(LegalizationArtifactCombiner &ArtCombiner,
+ MachineInstr &MI) {
+ SmallVector<MachineInstr *, 2> DeadInstrs;
+ ArtCombiner.tryCombineMerges(MI, DeadInstrs);
+ for (MachineInstr *DeadMI : DeadInstrs)
+ DeadMI->eraseFromParent();
+}
+
void MipsRegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@@ -621,18 +662,19 @@ void MipsRegisterBankInfo::applyMappingImpl(
MachineIRBuilder B(MI);
MachineFunction *MF = MI.getMF();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
+ const LegalizerInfo &LegInfo = *MF->getSubtarget().getLegalizerInfo();
InstManager NewInstrObserver(NewInstrs);
GISelObserverWrapper WrapperObserver(&NewInstrObserver);
LegalizerHelper Helper(*MF, WrapperObserver, B);
- LegalizationArtifactCombiner ArtCombiner(
- B, MF->getRegInfo(), *MF->getSubtarget().getLegalizerInfo());
+ LegalizationArtifactCombiner ArtCombiner(B, MF->getRegInfo(), LegInfo);
switch (MI.getOpcode()) {
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE:
case TargetOpcode::G_PHI:
- case TargetOpcode::G_SELECT: {
+ case TargetOpcode::G_SELECT:
+ case TargetOpcode::G_IMPLICIT_DEF: {
Helper.narrowScalar(MI, 0, LLT::scalar(32));
// Handle new instructions.
while (!NewInstrs.empty()) {
@@ -640,35 +682,21 @@ void MipsRegisterBankInfo::applyMappingImpl(
// This is new G_UNMERGE that was created during narrowScalar and will
// not be considered for regbank selection. RegBankSelect for mips
// visits/makes corresponding G_MERGE first. Combine them here.
- if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
- SmallVector<MachineInstr *, 2> DeadInstrs;
- ArtCombiner.tryCombineMerges(*NewMI, DeadInstrs);
- for (MachineInstr *DeadMI : DeadInstrs)
- DeadMI->eraseFromParent();
- }
+ if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES)
+ combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI);
// This G_MERGE will be combined away when its corresponding G_UNMERGE
// gets regBankSelected.
else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES)
continue;
else
- // Manually set register banks for all register operands to 32 bit gprb.
- for (auto Op : NewMI->operands()) {
- if (Op.isReg()) {
- assert(MRI.getType(Op.getReg()).getSizeInBits() == 32 &&
- "Only 32 bit gprb is handled here.\n");
- MRI.setRegBank(Op.getReg(), getRegBank(Mips::GPRBRegBankID));
- }
- }
+ // Manually set register banks for def operands to 32 bit gprb.
+ setRegBank(*NewMI, MRI);
}
return;
}
- case TargetOpcode::G_UNMERGE_VALUES: {
- SmallVector<MachineInstr *, 2> DeadInstrs;
- ArtCombiner.tryCombineMerges(MI, DeadInstrs);
- for (MachineInstr *DeadMI : DeadInstrs)
- DeadMI->eraseFromParent();
+ case TargetOpcode::G_UNMERGE_VALUES:
+ combineAwayG_UNMERGE_VALUES(ArtCombiner, MI);
return;
- }
default:
break;
}
diff --git a/lib/Target/Mips/MipsRegisterBankInfo.h b/lib/Target/Mips/MipsRegisterBankInfo.h
index 176813c031ed..fa0f1c7bc941 100644
--- a/lib/Target/Mips/MipsRegisterBankInfo.h
+++ b/lib/Target/Mips/MipsRegisterBankInfo.h
@@ -38,8 +38,17 @@ public:
const InstructionMapping &
getInstrMapping(const MachineInstr &MI) const override;
+ /// Here we have to narrowScalar s64 operands to s32, combine away G_MERGE or
+ /// G_UNMERGE, and erase instructions that became dead in the process. We
+ /// manually assign a register bank to the def operands of all instructions
+ /// created in the process, since they will not reach the RegBankSelect loop.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+ /// RegBankSelect determined that the s64 operand should be split into two
+ /// s32 operands in gprb. Here we manually set the register banks of the def
+ /// operands of newly created instructions, since they will not get regbankselected.
+ void setRegBank(MachineInstr &MI, MachineRegisterInfo &MRI) const;
+
private:
/// Some instructions are used with both floating point and integer operands.
/// We assign InstType to such instructions as it helps us to avoid cross bank
diff --git a/lib/Target/Mips/MipsRegisterBanks.td b/lib/Target/Mips/MipsRegisterBanks.td
index 14a0181f8f11..7d11475884ce 100644
--- a/lib/Target/Mips/MipsRegisterBanks.td
+++ b/lib/Target/Mips/MipsRegisterBanks.td
@@ -11,4 +11,4 @@
def GPRBRegBank : RegisterBank<"GPRB", [GPR32]>;
-def FPRBRegBank : RegisterBank<"FPRB", [FGR64, AFGR64]>;
+def FPRBRegBank : RegisterBank<"FPRB", [FGR64, AFGR64, MSA128D]>;
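
Adding MSA128D to the FPR bank's class list is what lets the 128-bit mappings above resolve to fprb. An illustrative check, as it would look inside MipsRegisterBankInfo (sketch, not from this commit):

// Sketch: the FPR bank now covers the 128-bit MSA register classes, which is
// what getRegBankFromRegClass() relies on when it returns FPRBRegBankID.
const RegisterBank &FPRB = getRegBank(Mips::FPRBRegBankID);
assert(FPRB.covers(Mips::MSA128DRegClass) && "MSA vectors now live in fprb");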
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 4c6cc1ef771c..166ddea0431f 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -171,8 +171,8 @@ void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) {
assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
const TargetRegisterClass *RC = RegInfo.intRegClass(4);
- unsigned VR = MRI.createVirtualRegister(RC);
- unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+ Register VR = MRI.createVirtualRegister(RC);
+ Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0);
BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst)
@@ -186,8 +186,8 @@ void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) {
assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
const TargetRegisterClass *RC = RegInfo.intRegClass(4);
- unsigned VR = MRI.createVirtualRegister(RC);
- unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+ Register VR = MRI.createVirtualRegister(RC);
+ Register Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR)
.addReg(Src, getKillRegState(I->getOperand(0).isKill()));
@@ -204,11 +204,11 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
- unsigned VR0 = MRI.createVirtualRegister(RC);
- unsigned VR1 = MRI.createVirtualRegister(RC);
- unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
- unsigned Lo = RegInfo.getSubReg(Dst, Mips::sub_lo);
- unsigned Hi = RegInfo.getSubReg(Dst, Mips::sub_hi);
+ Register VR0 = MRI.createVirtualRegister(RC);
+ Register VR1 = MRI.createVirtualRegister(RC);
+ Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+ Register Lo = RegInfo.getSubReg(Dst, Mips::sub_lo);
+ Register Hi = RegInfo.getSubReg(Dst, Mips::sub_hi);
DebugLoc DL = I->getDebugLoc();
const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
@@ -229,9 +229,9 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
- unsigned VR0 = MRI.createVirtualRegister(RC);
- unsigned VR1 = MRI.createVirtualRegister(RC);
- unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+ Register VR0 = MRI.createVirtualRegister(RC);
+ Register VR1 = MRI.createVirtualRegister(RC);
+ Register Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
unsigned SrcKill = getKillRegState(I->getOperand(0).isKill());
DebugLoc DL = I->getDebugLoc();
@@ -242,7 +242,7 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
}
bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) {
- unsigned Src = I->getOperand(1).getReg();
+ Register Src = I->getOperand(1).getReg();
std::pair<unsigned, unsigned> Opcodes = getMFHiLoOpc(Src);
if (!Opcodes.first)
@@ -262,11 +262,11 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
const TargetRegisterClass *DstRC = RegInfo.getMinimalPhysRegClass(Dst);
unsigned VRegSize = RegInfo.getRegSizeInBits(*DstRC) / 16;
const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize);
- unsigned VR0 = MRI.createVirtualRegister(RC);
- unsigned VR1 = MRI.createVirtualRegister(RC);
+ Register VR0 = MRI.createVirtualRegister(RC);
+ Register VR1 = MRI.createVirtualRegister(RC);
unsigned SrcKill = getKillRegState(I->getOperand(1).isKill());
- unsigned DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo);
- unsigned DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi);
+ Register DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo);
+ Register DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi);
DebugLoc DL = I->getDebugLoc();
BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
@@ -304,9 +304,9 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
// stack is used.
if (I->getNumOperands() == 4 && I->getOperand(3).isReg()
&& I->getOperand(3).getReg() == Mips::SP) {
- unsigned DstReg = I->getOperand(0).getReg();
- unsigned LoReg = I->getOperand(1).getReg();
- unsigned HiReg = I->getOperand(2).getReg();
+ Register DstReg = I->getOperand(0).getReg();
+ Register LoReg = I->getOperand(1).getReg();
+ Register HiReg = I->getOperand(2).getReg();
// It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
// the cases where mthc1 is not available). 64-bit architectures and
@@ -346,7 +346,7 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
const MachineOperand &Op2 = I->getOperand(2);
if ((Op1.isReg() && Op1.isUndef()) || (Op2.isReg() && Op2.isUndef())) {
- unsigned DstReg = I->getOperand(0).getReg();
+ Register DstReg = I->getOperand(0).getReg();
BuildMI(MBB, I, I->getDebugLoc(), TII.get(Mips::IMPLICIT_DEF), DstReg);
return true;
}
@@ -369,8 +369,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
// stack is used.
if (I->getNumOperands() == 4 && I->getOperand(3).isReg()
&& I->getOperand(3).getReg() == Mips::SP) {
- unsigned DstReg = I->getOperand(0).getReg();
- unsigned SrcReg = Op1.getReg();
+ Register DstReg = I->getOperand(0).getReg();
+ Register SrcReg = Op1.getReg();
unsigned N = Op2.getImm();
int64_t Offset = 4 * (Subtarget.isLittle() ? N : (1 - N));
@@ -538,7 +538,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
if (RegInfo.needsStackRealignment(MF)) {
// addiu $Reg, $zero, -MaxAlignment
// andi $sp, $sp, $Reg
- unsigned VR = MF.getRegInfo().createVirtualRegister(RC);
+ Register VR = MF.getRegInfo().createVirtualRegister(RC);
assert(isInt<16>(MFI.getMaxAlignment()) &&
"Function's alignment size requirement is not supported.");
int MaxAlign = -(int)MFI.getMaxAlignment();
@@ -865,12 +865,15 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
MipsABIInfo ABI = STI.getABI();
+ unsigned RA = ABI.IsN64() ? Mips::RA_64 : Mips::RA;
unsigned FP = ABI.GetFramePtr();
unsigned BP = ABI.IsN64() ? Mips::S7_64 : Mips::S7;
- // Mark $fp as used if function has dedicated frame pointer.
- if (hasFP(MF))
+ // Mark $ra and $fp as used if function has dedicated frame pointer.
+ if (hasFP(MF)) {
+ setAliasRegs(MF, SavedRegs, RA);
setAliasRegs(MF, SavedRegs, FP);
+ }
// Mark $s7 as used if function has dedicated base pointer.
if (hasBP(MF))
setAliasRegs(MF, SavedRegs, BP);
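
setAliasRegs is not shown in this diff; it is assumed to mark the given register and all of its aliases in the callee-saved set, roughly:

// Sketch of the assumed helper: set Reg plus every aliasing register.
static void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs,
                         unsigned Reg) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  for (MCRegAliasIterator AI(Reg, TRI, /*IncludeSelf=*/true); AI.isValid(); ++AI)
    SavedRegs.set(*AI);
}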
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 703f99f37dd1..c8313240a678 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -124,6 +124,33 @@ bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
return true;
}
+void MipsSEDAGToDAGISel::emitMCountABI(MachineInstr &MI, MachineBasicBlock &MBB,
+ MachineFunction &MF) {
+ MachineInstrBuilder MIB(MF, &MI);
+ if (!Subtarget->isABI_O32()) { // N32, N64
+ // Save current return address.
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Mips::OR64))
+ .addDef(Mips::AT_64)
+ .addUse(Mips::RA_64, RegState::Undef)
+ .addUse(Mips::ZERO_64);
+ // Stops the instruction above from being removed later on.
+ MIB.addUse(Mips::AT_64, RegState::Implicit);
+ } else { // O32
+ // Save current return address.
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Mips::OR))
+ .addDef(Mips::AT)
+ .addUse(Mips::RA, RegState::Undef)
+ .addUse(Mips::ZERO);
+ // _mcount pops 2 words from stack.
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Mips::ADDiu))
+ .addDef(Mips::SP)
+ .addUse(Mips::SP)
+ .addImm(-8);
+ // Stops the first instruction above from being removed later on.
+ MIB.addUse(Mips::AT, RegState::Implicit);
+ }
+}
+
void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
MF.getInfo<MipsFunctionInfo>()->initGlobalBaseReg();
@@ -150,6 +177,24 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
if (Subtarget->isABI_FPXX() && !Subtarget->hasMTHC1())
MI.addOperand(MachineOperand::CreateReg(Mips::SP, false, true));
break;
+ case Mips::JAL:
+ case Mips::JAL_MM:
+ if (MI.getOperand(0).isGlobal() &&
+ MI.getOperand(0).getGlobal()->getGlobalIdentifier() == "_mcount")
+ emitMCountABI(MI, MBB, MF);
+ break;
+ case Mips::JALRPseudo:
+ case Mips::JALR64Pseudo:
+ case Mips::JALR16_MM:
+ if (MI.getOperand(2).isMCSymbol() &&
+ MI.getOperand(2).getMCSymbol()->getName() == "_mcount")
+ emitMCountABI(MI, MBB, MF);
+ break;
+ case Mips::JALR:
+ if (MI.getOperand(3).isMCSymbol() &&
+ MI.getOperand(3).getMCSymbol()->getName() == "_mcount")
+ emitMCountABI(MI, MBB, MF);
+ break;
default:
replaceUsesWithZeroReg(MRI, MI);
}
@@ -247,7 +292,8 @@ bool MipsSEDAGToDAGISel::selectAddrFrameIndexOffset(
Base = Addr.getOperand(0);
// If base is a FI, additional offset calculation is done in
// eliminateFrameIndex, otherwise we need to check the alignment
- if (OffsetToAlignment(CN->getZExtValue(), 1ull << ShiftAmount) != 0)
+ const Align Alignment(1ULL << ShiftAmount);
+ if (!isAligned(Alignment, CN->getZExtValue()))
return false;
}
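
The isAligned form is the same power-of-two divisibility test the old OffsetToAlignment comparison expressed; for reference (illustrative sketch):

// Sketch: both predicates hold exactly when the offset is a multiple of the
// (power-of-two) access alignment.
uint64_t V = CN->getZExtValue();
bool Old = OffsetToAlignment(V, 1ull << ShiftAmount) == 0; // distance to next boundary is zero
bool New = isAligned(Align(1ULL << ShiftAmount), V);       // V % alignment == 0
assert(Old == New);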
@@ -719,7 +765,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
case ISD::ConstantFP: {
- ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
+ auto *CN = cast<ConstantFPSDNode>(Node);
if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
if (Subtarget->isGP64bit()) {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
@@ -743,7 +789,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
case ISD::Constant: {
- const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Node);
+ auto *CN = cast<ConstantSDNode>(Node);
int64_t Imm = CN->getSExtValue();
unsigned Size = CN->getValueSizeInBits(0);
@@ -969,7 +1015,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
break;
}
- SDNode *Res;
+ SDNode *Res = nullptr;
// If we have a signed 10 bit integer, we can splat it directly.
//
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index ce594e1fb4fa..39f665be571e 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -120,7 +120,7 @@ private:
/// power of 2.
bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override;
/// Select constant vector splats whose value is a run of set bits
- /// ending at the most significant bit
+ /// ending at the most significant bit.
bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override;
/// Select constant vector splats whose value is a run of set bits
/// starting at bit zero.
@@ -128,6 +128,10 @@ private:
bool trySelect(SDNode *Node) override;
+ // Emits the proper ABI call setup for _mcount profiling calls.
+ void emitMCountABI(MachineInstr &MI, MachineBasicBlock &MBB,
+ MachineFunction &MF);
+
void processFunctionAfterISel(MachineFunction &MF) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index edf57a3840d1..5bd234f955ba 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -71,8 +71,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
if (Subtarget.hasDSP() || Subtarget.hasMSA()) {
// Expand all truncating stores and extending loads.
- for (MVT VT0 : MVT::vector_valuetypes()) {
- for (MVT VT1 : MVT::vector_valuetypes()) {
+ for (MVT VT0 : MVT::fixedlen_vector_valuetypes()) {
+ for (MVT VT1 : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT0, VT1, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT0, VT1, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT0, VT1, Expand);
@@ -327,6 +327,7 @@ addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+ setOperationAction(ISD::UNDEF, Ty, Legal);
setOperationAction(ISD::ADD, Ty, Legal);
setOperationAction(ISD::AND, Ty, Legal);
@@ -2595,7 +2596,8 @@ static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy,
SDLoc DL(Op);
return DAG.getNode(MipsISD::SHF, DL, ResTy,
- DAG.getConstant(Imm, DL, MVT::i32), Op->getOperand(0));
+ DAG.getTargetConstant(Imm, DL, MVT::i32),
+ Op->getOperand(0));
}
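
Switching to getTargetConstant matters because the SHF immediate is encoded directly into the instruction; a plain constant node may be materialized into a register and then fail to match the pattern. The distinction, illustratively:

// Sketch: operands that instruction patterns expect as immediates must be
// TargetConstant nodes; ordinary Constant nodes are values that selection is
// free to load into a register.
SDValue AsValue = DAG.getConstant(Imm, DL, MVT::i32);            // selectable value
SDValue AsImmOperand = DAG.getTargetConstant(Imm, DL, MVT::i32); // matched verbatim as an immediate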
/// Determine whether a range fits a regular pattern of values.
@@ -3062,13 +3064,13 @@ MipsSETargetLowering::emitBPOSGE32(MachineInstr &MI,
BuildMI(BB, DL, TII->get(Mips::BPOSGE32C_MMR3)).addMBB(TBB);
// Fill $FBB.
- unsigned VR2 = RegInfo.createVirtualRegister(RC);
+ Register VR2 = RegInfo.createVirtualRegister(RC);
BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2)
.addReg(Mips::ZERO).addImm(0);
BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
// Fill $TBB.
- unsigned VR1 = RegInfo.createVirtualRegister(RC);
+ Register VR1 = RegInfo.createVirtualRegister(RC);
BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1)
.addReg(Mips::ZERO).addImm(1);
@@ -3131,13 +3133,13 @@ MachineBasicBlock *MipsSETargetLowering::emitMSACBranchPseudo(
.addMBB(TBB);
// Fill $FBB.
- unsigned RD1 = RegInfo.createVirtualRegister(RC);
+ Register RD1 = RegInfo.createVirtualRegister(RC);
BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), RD1)
.addReg(Mips::ZERO).addImm(0);
BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
// Fill $TBB.
- unsigned RD2 = RegInfo.createVirtualRegister(RC);
+ Register RD2 = RegInfo.createVirtualRegister(RC);
BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), RD2)
.addReg(Mips::ZERO).addImm(1);
@@ -3169,8 +3171,8 @@ MipsSETargetLowering::emitCOPY_FW(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Fd = MI.getOperand(0).getReg();
- unsigned Ws = MI.getOperand(1).getReg();
+ Register Fd = MI.getOperand(0).getReg();
+ Register Ws = MI.getOperand(1).getReg();
unsigned Lane = MI.getOperand(2).getImm();
if (Lane == 0) {
@@ -3185,9 +3187,9 @@ MipsSETargetLowering::emitCOPY_FW(MachineInstr &MI,
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
} else {
- unsigned Wt = RegInfo.createVirtualRegister(
- Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
- &Mips::MSA128WEvensRegClass);
+ Register Wt = RegInfo.createVirtualRegister(
+ Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass
+ : &Mips::MSA128WEvensRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
@@ -3214,15 +3216,15 @@ MipsSETargetLowering::emitCOPY_FD(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
- unsigned Fd = MI.getOperand(0).getReg();
- unsigned Ws = MI.getOperand(1).getReg();
+ Register Fd = MI.getOperand(0).getReg();
+ Register Ws = MI.getOperand(1).getReg();
unsigned Lane = MI.getOperand(2).getImm() * 2;
DebugLoc DL = MI.getDebugLoc();
if (Lane == 0)
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
else {
- unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+ Register Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1);
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
@@ -3244,13 +3246,13 @@ MipsSETargetLowering::emitINSERT_FW(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Wd = MI.getOperand(0).getReg();
- unsigned Wd_in = MI.getOperand(1).getReg();
+ Register Wd = MI.getOperand(0).getReg();
+ Register Wd_in = MI.getOperand(1).getReg();
unsigned Lane = MI.getOperand(2).getImm();
- unsigned Fs = MI.getOperand(3).getReg();
- unsigned Wt = RegInfo.createVirtualRegister(
- Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
- &Mips::MSA128WEvensRegClass);
+ Register Fs = MI.getOperand(3).getReg();
+ Register Wt = RegInfo.createVirtualRegister(
+ Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass
+ : &Mips::MSA128WEvensRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
.addImm(0)
@@ -3280,11 +3282,11 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Wd = MI.getOperand(0).getReg();
- unsigned Wd_in = MI.getOperand(1).getReg();
+ Register Wd = MI.getOperand(0).getReg();
+ Register Wd_in = MI.getOperand(1).getReg();
unsigned Lane = MI.getOperand(2).getImm();
- unsigned Fs = MI.getOperand(3).getReg();
- unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+ Register Fs = MI.getOperand(3).getReg();
+ Register Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
.addImm(0)
@@ -3326,10 +3328,10 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX(
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Wd = MI.getOperand(0).getReg();
- unsigned SrcVecReg = MI.getOperand(1).getReg();
- unsigned LaneReg = MI.getOperand(2).getReg();
- unsigned SrcValReg = MI.getOperand(3).getReg();
+ Register Wd = MI.getOperand(0).getReg();
+ Register SrcVecReg = MI.getOperand(1).getReg();
+ Register LaneReg = MI.getOperand(2).getReg();
+ Register SrcValReg = MI.getOperand(3).getReg();
const TargetRegisterClass *VecRC = nullptr;
// FIXME: This should be true for N32 too.
@@ -3370,7 +3372,7 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX(
}
if (IsFP) {
- unsigned Wt = RegInfo.createVirtualRegister(VecRC);
+ Register Wt = RegInfo.createVirtualRegister(VecRC);
BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
.addImm(0)
.addReg(SrcValReg)
@@ -3380,7 +3382,7 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX(
// Convert the lane index into a byte index
if (EltSizeInBytes != 1) {
- unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC);
+ Register LaneTmp1 = RegInfo.createVirtualRegister(GPRRC);
BuildMI(*BB, MI, DL, TII->get(ShiftOp), LaneTmp1)
.addReg(LaneReg)
.addImm(EltLog2Size);
@@ -3388,13 +3390,13 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX(
}
// Rotate bytes around so that the desired lane is element zero
- unsigned WdTmp1 = RegInfo.createVirtualRegister(VecRC);
+ Register WdTmp1 = RegInfo.createVirtualRegister(VecRC);
BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1)
.addReg(SrcVecReg)
.addReg(SrcVecReg)
.addReg(LaneReg, 0, SubRegIdx);
- unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC);
+ Register WdTmp2 = RegInfo.createVirtualRegister(VecRC);
if (IsFP) {
// Use insve.df to insert to element zero
BuildMI(*BB, MI, DL, TII->get(InsveOp), WdTmp2)
@@ -3413,7 +3415,7 @@ MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX(
// Rotate elements the rest of the way for a full rotation.
// sld.df interprets $rt modulo the number of columns so we only need to negate
// the lane index to do this.
- unsigned LaneTmp2 = RegInfo.createVirtualRegister(GPRRC);
+ Register LaneTmp2 = RegInfo.createVirtualRegister(GPRRC);
BuildMI(*BB, MI, DL, TII->get(Subtarget.isABI_N64() ? Mips::DSUB : Mips::SUB),
LaneTmp2)
.addReg(Subtarget.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO)
@@ -3440,12 +3442,12 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Wd = MI.getOperand(0).getReg();
- unsigned Fs = MI.getOperand(1).getReg();
- unsigned Wt1 = RegInfo.createVirtualRegister(
+ Register Wd = MI.getOperand(0).getReg();
+ Register Fs = MI.getOperand(1).getReg();
+ Register Wt1 = RegInfo.createVirtualRegister(
Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass
: &Mips::MSA128WEvensRegClass);
- unsigned Wt2 = RegInfo.createVirtualRegister(
+ Register Wt2 = RegInfo.createVirtualRegister(
Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass
: &Mips::MSA128WEvensRegClass);
@@ -3475,10 +3477,10 @@ MipsSETargetLowering::emitFILL_FD(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Wd = MI.getOperand(0).getReg();
- unsigned Fs = MI.getOperand(1).getReg();
- unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
- unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+ Register Wd = MI.getOperand(0).getReg();
+ Register Fs = MI.getOperand(1).getReg();
+ Register Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+ Register Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
@@ -3509,8 +3511,8 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Ws = MI.getOperand(0).getReg();
- unsigned Rt = MI.getOperand(1).getReg();
+ Register Ws = MI.getOperand(0).getReg();
+ Register Rt = MI.getOperand(1).getReg();
const MachineMemOperand &MMO = **MI.memoperands_begin();
unsigned Imm = MMO.getOffset();
@@ -3522,11 +3524,11 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI,
: (Subtarget.isABI_O32() ? &Mips::GPR32RegClass
: &Mips::GPR64RegClass);
const bool UsingMips32 = RC == &Mips::GPR32RegClass;
- unsigned Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass);
+ Register Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0);
if(!UsingMips32) {
- unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass);
+ Register Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Tmp)
.addImm(0)
.addReg(Rs)
@@ -3564,7 +3566,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Wd = MI.getOperand(0).getReg();
+ Register Wd = MI.getOperand(0).getReg();
// Caution: A load via the GOT can expand to a GPR32 operand, a load via
// spill and reload can expand as a GPR64 operand. Examine the
@@ -3575,7 +3577,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI,
: &Mips::GPR64RegClass);
const bool UsingMips32 = RC == &Mips::GPR32RegClass;
- unsigned Rt = RegInfo.createVirtualRegister(RC);
+ Register Rt = RegInfo.createVirtualRegister(RC);
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::LH : Mips::LH64), Rt);
@@ -3583,7 +3585,7 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI,
MIB.add(MI.getOperand(i));
if(!UsingMips32) {
- unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass);
+ Register Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Tmp).addReg(Rt, 0, Mips::sub_32);
Rt = Tmp;
}
@@ -3658,11 +3660,11 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Wd = MI.getOperand(0).getReg();
- unsigned Fs = MI.getOperand(1).getReg();
+ Register Wd = MI.getOperand(0).getReg();
+ Register Fs = MI.getOperand(1).getReg();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
- unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+ Register Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
const TargetRegisterClass *GPRRC =
IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
unsigned MFC1Opc = IsFGR64onMips64
@@ -3671,16 +3673,16 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI,
unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W;
// Perform the register class copy as mentioned above.
- unsigned Rtemp = RegInfo.createVirtualRegister(GPRRC);
+ Register Rtemp = RegInfo.createVirtualRegister(GPRRC);
BuildMI(*BB, MI, DL, TII->get(MFC1Opc), Rtemp).addReg(Fs);
BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp);
unsigned WPHI = Wtemp;
if (IsFGR64onMips32) {
- unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC);
+ Register Rtemp2 = RegInfo.createVirtualRegister(GPRRC);
BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs);
- unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
- unsigned Wtemp3 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+ Register Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+ Register Wtemp3 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_W), Wtemp2)
.addReg(Wtemp)
.addReg(Rtemp2)
@@ -3693,7 +3695,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI,
}
if (IsFGR64) {
- unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+ Register Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::FEXDO_W), Wtemp2)
.addReg(WPHI)
.addReg(WPHI);
@@ -3817,8 +3819,8 @@ MipsSETargetLowering::emitFEXP2_W_1(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
const TargetRegisterClass *RC = &Mips::MSA128WRegClass;
- unsigned Ws1 = RegInfo.createVirtualRegister(RC);
- unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+ Register Ws1 = RegInfo.createVirtualRegister(RC);
+ Register Ws2 = RegInfo.createVirtualRegister(RC);
DebugLoc DL = MI.getDebugLoc();
// Splat 1.0 into a vector
@@ -3846,8 +3848,8 @@ MipsSETargetLowering::emitFEXP2_D_1(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
const TargetRegisterClass *RC = &Mips::MSA128DRegClass;
- unsigned Ws1 = RegInfo.createVirtualRegister(RC);
- unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+ Register Ws1 = RegInfo.createVirtualRegister(RC);
+ Register Ws2 = RegInfo.createVirtualRegister(RC);
DebugLoc DL = MI.getDebugLoc();
// Splat 1.0 into a vector
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 4e49f5e7d9d1..2126a1bda493 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -628,7 +628,7 @@ unsigned MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
// The first instruction can be a LUi, which is different from other
// instructions (ADDiu, ORI and SLL) in that it does not have a register
// operand.
- unsigned Reg = RegInfo.createVirtualRegister(RC);
+ Register Reg = RegInfo.createVirtualRegister(RC);
if (Inst->Opc == LUi)
BuildMI(MBB, II, DL, get(LUi), Reg).addImm(SignExtend64<16>(Inst->ImmOpnd));
@@ -734,9 +734,9 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB,
// Add lo/hi registers if the mtlo/hi instructions created have explicit
// def registers.
if (HasExplicitDef) {
- unsigned DstReg = I->getOperand(0).getReg();
- unsigned DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
- unsigned DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi);
+ Register DstReg = I->getOperand(0).getReg();
+ Register DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+ Register DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi);
LoInst.addReg(DstLo, RegState::Define);
HiInst.addReg(DstHi, RegState::Define);
}
@@ -773,14 +773,14 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
bool isMicroMips,
bool FP64) const {
- unsigned DstReg = I->getOperand(0).getReg();
- unsigned SrcReg = I->getOperand(1).getReg();
+ Register DstReg = I->getOperand(0).getReg();
+ Register SrcReg = I->getOperand(1).getReg();
unsigned N = I->getOperand(2).getImm();
DebugLoc dl = I->getDebugLoc();
assert(N < 2 && "Invalid immediate");
unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo;
- unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
+ Register SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
// FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload
// in MipsSEFrameLowering.cpp.
@@ -815,7 +815,7 @@ void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
bool isMicroMips, bool FP64) const {
- unsigned DstReg = I->getOperand(0).getReg();
+ Register DstReg = I->getOperand(0).getReg();
unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
DebugLoc dl = I->getDebugLoc();
@@ -883,8 +883,8 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
unsigned RA = Subtarget.isGP64bit() ? Mips::RA_64 : Mips::RA;
unsigned T9 = Subtarget.isGP64bit() ? Mips::T9_64 : Mips::T9;
unsigned ZERO = Subtarget.isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
- unsigned OffsetReg = I->getOperand(0).getReg();
- unsigned TargetReg = I->getOperand(1).getReg();
+ Register OffsetReg = I->getOperand(0).getReg();
+ Register TargetReg = I->getOperand(1).getReg();
// addu $ra, $v0, $zero
// addu $sp, $sp, $v1
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index f4b164d5c0ab..a48088c28919 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -212,11 +212,9 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
// element size), otherwise it is a 16-bit signed immediate.
unsigned OffsetBitSize =
getLoadStoreOffsetSizeInBits(MI.getOpcode(), MI.getOperand(OpNo - 1));
- unsigned OffsetAlign = getLoadStoreOffsetAlign(MI.getOpcode());
-
+ const Align OffsetAlign(getLoadStoreOffsetAlign(MI.getOpcode()));
if (OffsetBitSize < 16 && isInt<16>(Offset) &&
- (!isIntN(OffsetBitSize, Offset) ||
- OffsetToAlignment(Offset, OffsetAlign) != 0)) {
+ (!isIntN(OffsetBitSize, Offset) || !isAligned(OffsetAlign, Offset))) {
// If we have an offset that needs to fit into a signed n-bit immediate
// (where n < 16) and doesn't, but does fit into 16-bits then use an ADDiu
MachineBasicBlock &MBB = *MI.getParent();
@@ -224,7 +222,7 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
const TargetRegisterClass *PtrRC =
ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
- unsigned Reg = RegInfo.createVirtualRegister(PtrRC);
+ Register Reg = RegInfo.createVirtualRegister(PtrRC);
const MipsSEInstrInfo &TII =
*static_cast<const MipsSEInstrInfo *>(
MBB.getParent()->getSubtarget().getInstrInfo());
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index d021b3d021b1..b9245c9fc0eb 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -69,7 +69,7 @@ void MipsSubtarget::anchor() {}
MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
bool little, const MipsTargetMachine &TM,
- unsigned StackAlignOverride)
+ MaybeAlign StackAlignOverride)
: MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false),
NoABICalls(false), Abs2008(false), IsFP64bit(false), UseOddSPReg(true),
@@ -81,10 +81,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
HasEVA(false), DisableMadd4(false), HasMT(false), HasCRC(false),
HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false),
- StackAlignOverride(StackAlignOverride),
- TM(TM), TargetTriple(TT), TSInfo(),
- InstrInfo(
- MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
+ StackAlignOverride(StackAlignOverride), TM(TM), TargetTriple(TT),
+ TSInfo(), InstrInfo(MipsInstrInfo::create(
+ initializeSubtargetDependencies(CPU, FS, TM))),
FrameLowering(MipsFrameLowering::create(*this)),
TLInfo(MipsTargetLowering::create(TM, *this)) {
@@ -248,12 +247,12 @@ MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
InMips16HardFloat = true;
if (StackAlignOverride)
- stackAlignment = StackAlignOverride;
+ stackAlignment = *StackAlignOverride;
else if (isABI_N32() || isABI_N64())
- stackAlignment = 16;
+ stackAlignment = Align(16);
else {
assert(isABI_O32() && "Unknown ABI for stack alignment!");
- stackAlignment = 8;
+ stackAlignment = Align(8);
}
return *this;
@@ -286,6 +285,6 @@ const RegisterBankInfo *MipsSubtarget::getRegBankInfo() const {
return RegBankInfo.get();
}
-const InstructionSelector *MipsSubtarget::getInstructionSelector() const {
+InstructionSelector *MipsSubtarget::getInstructionSelector() const {
return InstSelector.get();
}
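
StackAlignOverride is now a MaybeAlign, so "no override" is representable without a magic zero. The selection logic above, restated compactly (illustrative; the ABI flag is a placeholder):

// Sketch: MaybeAlign() / MaybeAlign(0) mean "not set"; a set value wins over
// the ABI default.
MaybeAlign Override;                            // e.g. -stack-alignment not given
bool IsN32OrN64 = true;                         // placeholder for isABI_N32() || isABI_N64()
Align StackAlign = Override ? *Override : Align(IsN32OrN64 ? 16 : 8);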
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index aa1200579fc8..0a8c2ef8ae5c 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -189,12 +189,15 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// Disable use of the `jal` instruction.
bool UseLongCalls = false;
+ // Assume 32-bit GOT.
+ bool UseXGOT = false;
+
/// The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
- unsigned stackAlignment;
+ Align stackAlignment;
/// The overridden stack alignment.
- unsigned StackAlignOverride;
+ MaybeAlign StackAlignOverride;
InstrItineraryData InstrItins;
@@ -227,7 +230,7 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, bool little,
- const MipsTargetMachine &TM, unsigned StackAlignOverride);
+ const MipsTargetMachine &TM, MaybeAlign StackAlignOverride);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -323,6 +326,8 @@ public:
bool useLongCalls() const { return UseLongCalls; }
+ bool useXGOT() const { return UseXGOT; }
+
bool enableLongBranchPass() const {
return hasStandardEncoding() || inMicroMipsMode() || allowMixed16_32();
}
@@ -344,7 +349,7 @@ public:
// really use them if in addition we are in mips16 mode
static bool useConstantIslands();
- unsigned getStackAlignment() const { return stackAlignment; }
+ Align getStackAlignment() const { return stackAlignment; }
// Grab relocation model
Reloc::Model getRelocationModel() const;
@@ -391,7 +396,7 @@ public:
const CallLowering *getCallLowering() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
- const InstructionSelector *getInstructionSelector() const override;
+ InstructionSelector *getInstructionSelector() const override;
};
} // End llvm namespace
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index c878abb042e4..e58f316791ba 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -117,14 +117,17 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
CPU, FS, Options, getEffectiveRelocModel(JIT, RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
- isLittle(isLittle), TLOF(llvm::make_unique<MipsTargetObjectFile>()),
+ isLittle(isLittle), TLOF(std::make_unique<MipsTargetObjectFile>()),
ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)),
- Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this,
- Options.StackAlignmentOverride),
+ Subtarget(nullptr),
+ DefaultSubtarget(TT, CPU, FS, isLittle, *this,
+ MaybeAlign(Options.StackAlignmentOverride)),
NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
- isLittle, *this, Options.StackAlignmentOverride),
+ isLittle, *this,
+ MaybeAlign(Options.StackAlignmentOverride)),
Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16",
- isLittle, *this, Options.StackAlignmentOverride) {
+ isLittle, *this,
+ MaybeAlign(Options.StackAlignmentOverride)) {
Subtarget = &DefaultSubtarget;
initAsmInfo();
}
@@ -196,8 +199,9 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<MipsSubtarget>(TargetTriple, CPU, FS, isLittle, *this,
- Options.StackAlignmentOverride);
+ I = std::make_unique<MipsSubtarget>(
+ TargetTriple, CPU, FS, isLittle, *this,
+ MaybeAlign(Options.StackAlignmentOverride));
}
return I.get();
}
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index 1fa8ebadd643..298d056ce2c3 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -130,6 +130,8 @@ public:
SMLoc IDLoc, const MCSubtargetInfo *STI);
void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2,
SMLoc IDLoc, const MCSubtargetInfo *STI);
+ void emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2,
+ MCOperand Op3, SMLoc IDLoc, const MCSubtargetInfo *STI);
void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm,
SMLoc IDLoc, const MCSubtargetInfo *STI);
void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0,
@@ -154,17 +156,13 @@ public:
unsigned BaseReg, int64_t Offset,
function_ref<unsigned()> GetATReg, SMLoc IDLoc,
const MCSubtargetInfo *STI);
- void emitStoreWithSymOffset(unsigned Opcode, unsigned SrcReg,
- unsigned BaseReg, MCOperand &HiOperand,
- MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc,
- const MCSubtargetInfo *STI);
+ void emitSCWithSymOffset(unsigned Opcode, unsigned SrcReg, unsigned BaseReg,
+ MCOperand &HiOperand, MCOperand &LoOperand,
+ unsigned ATReg, SMLoc IDLoc,
+ const MCSubtargetInfo *STI);
void emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg,
int64_t Offset, unsigned TmpReg, SMLoc IDLoc,
const MCSubtargetInfo *STI);
- void emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg,
- MCOperand &HiOperand, MCOperand &LoOperand,
- unsigned ATReg, SMLoc IDLoc,
- const MCSubtargetInfo *STI);
void emitGPRestore(int Offset, SMLoc IDLoc, const MCSubtargetInfo *STI);
void forbidModuleDirective() { ModuleDirectiveAllowed = false; }
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 6530c40ea100..0acbace5f848 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -44,7 +44,7 @@ MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
FunctionPass *createNVPTXImageOptimizerPass();
FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
-BasicBlockPass *createNVPTXLowerAllocaPass();
+FunctionPass *createNVPTXLowerAllocaPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 5f38b4a3c4c5..307f4d58c3ab 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -282,7 +282,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
}
unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
DenseMap<unsigned, unsigned> &RegMap = VRegMapping[RC];
@@ -434,7 +434,7 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
return false;
}
-void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
AsmPrinter::EmitBasicBlockStart(MBB);
if (isLoopHeaderOfNoUnroll(MBB))
OutStreamer->EmitRawText(StringRef("\t.pragma \"nounroll\";\n"));
@@ -507,8 +507,8 @@ const MCSymbol *NVPTXAsmPrinter::getFunctionFrameSymbol() const {
}
void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
- unsigned RegNo = MI->getOperand(0).getReg();
- if (TargetRegisterInfo::isVirtualRegister(RegNo)) {
+ Register RegNo = MI->getOperand(0).getReg();
+ if (Register::isVirtualRegister(RegNo)) {
OutStreamer->AddComment(Twine("implicit-def: ") +
getVirtualRegisterName(RegNo));
} else {
@@ -1397,7 +1397,7 @@ static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) {
auto *FTy = dyn_cast<FunctionType>(Ty);
if (FTy)
- return DL.getPointerPrefAlignment();
+ return DL.getPointerPrefAlignment().value();
return DL.getPrefTypeAlignment(Ty);
}
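
getPointerPrefAlignment() now hands back an llvm::Align instead of a plain unsigned, so call sites that still need an integer (like getOpenCLAlignment above) go through .value(). Roughly, the type stores the log2 of the alignment, so zero and non-power-of-two values cannot be represented; the class below is a toy reconstruction for illustration, not the real Alignment.h:

#include <cassert>
#include <cstdint>

// Toy version of llvm::Align: only the shift amount is stored, so every
// instance is a power of two by construction, and value() rebuilds the byte
// count on demand.
class AlignSketch {
  uint8_t ShiftValue = 0;

public:
  explicit AlignSketch(uint64_t Bytes) {
    assert(Bytes != 0 && (Bytes & (Bytes - 1)) == 0 && "not a power of two");
    while ((uint64_t(1) << ShiftValue) != Bytes)
      ++ShiftValue;
  }
  uint64_t value() const { return uint64_t(1) << ShiftValue; }
  friend bool operator<(AlignSketch L, AlignSketch R) {
    return L.ShiftValue < R.ShiftValue;
  }
};

int main() {
  AlignSketch PrefAlign(8);
  assert(PrefAlign.value() == 8);           // what .value() feeds to legacy code
  assert(AlignSketch(4) < AlignSketch(16)); // comparisons stay cheap
  return 0;
}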
@@ -1473,12 +1473,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
- unsigned align = PAL.getParamAlignment(paramIndex);
- if (align == 0)
- align = DL.getABITypeAlignment(Ty);
+ const Align align = DL.getValueOrABITypeAlignment(
+ PAL.getParamAlignment(paramIndex), Ty);
unsigned sz = DL.getTypeAllocSize(Ty);
- O << "\t.param .align " << align << " .b8 ";
+ O << "\t.param .align " << align.value() << " .b8 ";
printParamName(I, paramIndex, O);
O << "[" << sz << "]";
@@ -1559,9 +1558,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
- unsigned align = PAL.getParamAlignment(paramIndex);
- if (align == 0)
- align = DL.getABITypeAlignment(ETy);
+ Align align =
+ DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy);
// Work around a bug in ptxas. When PTX code takes address of
// byval parameter with alignment < 4, ptxas generates code to
// spill argument into memory. Alas on sm_50+ ptxas generates
@@ -1573,10 +1571,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// TODO: this will need to be undone when we get to support multi-TU
// device-side compilation as it breaks ABI compatibility with nvcc.
// Hopefully ptxas bug is fixed by then.
- if (!isKernelFunc && align < 4)
- align = 4;
+ if (!isKernelFunc && align < Align(4))
+ align = Align(4);
unsigned sz = DL.getTypeAllocSize(ETy);
- O << "\t.param .align " << align << " .b8 ";
+ O << "\t.param .align " << align.value() << " .b8 ";
printParamName(I, paramIndex, O);
O << "[" << sz << "]";
continue;
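
Both parameter-printing paths above now compute the alignment the same way: take the explicit parameter alignment if one was given, otherwise the ABI alignment of the type, and for the non-kernel byval case clamp to at least 4 to avoid the ptxas spill problem described in the comment. A small sketch of that decision, with illustrative names only:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Mirrors the byval branch above: an explicit attribute alignment wins, the
// ABI alignment is the fallback, and non-kernel byval parameters are bumped
// to at least 4 bytes. This re-states the logic; it is not LLVM API.
uint64_t chooseParamAlign(uint64_t ExplicitAlign, uint64_t ABIAlign,
                          bool IsKernelFunc) {
  uint64_t Align = ExplicitAlign ? ExplicitAlign : ABIAlign;
  if (!IsKernelFunc)
    Align = std::max<uint64_t>(Align, 4);
  return Align;
}

int main() {
  assert(chooseParamAlign(0, 2, /*IsKernelFunc=*/false) == 4); // clamped up
  assert(chooseParamAlign(1, 8, /*IsKernelFunc=*/true) == 1);  // kernel keeps 1
  assert(chooseParamAlign(0, 8, /*IsKernelFunc=*/false) == 8); // ABI fallback
  return 0;
}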
@@ -1653,7 +1651,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
// We use the per class virtual register number in the ptx output.
unsigned int numVRs = MRI->getNumVirtRegs();
for (unsigned i = 0; i < numVRs; i++) {
- unsigned int vr = TRI->index2VirtReg(i);
+ unsigned int vr = Register::index2VirtReg(i);
const TargetRegisterClass *RC = MRI->getRegClass(vr);
DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
int n = regmap.size();
@@ -1861,7 +1859,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
case Type::HalfTyID:
case Type::FloatTyID:
case Type::DoubleTyID: {
- const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
+ const auto *CFP = cast<ConstantFP>(CPV);
Type *Ty = CFP->getType();
if (Ty == Type::getHalfTy(CPV->getContext())) {
APInt API = CFP->getValueAPF().bitcastToAPInt();
@@ -2212,7 +2210,7 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
const MachineOperand &MO = MI->getOperand(opNum);
switch (MO.getType()) {
case MachineOperand::MO_Register:
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (Register::isPhysicalRegister(MO.getReg())) {
if (MO.getReg() == NVPTX::VRDepot)
O << DEPOTNAME << getFunctionNumber();
else
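
The register-related edits in this file replace the static TargetRegisterInfo::isVirtualRegister/isPhysicalRegister helpers (and TRI->index2VirtReg) with the new llvm::Register class, and operand results are now held in a Register rather than an unsigned. The sketch below copies the underlying convention, namely that virtual register numbers carry the top bit, but it is a stand-in, not the real Register.h:

#include <cassert>
#include <cstdint>

// Toy llvm::Register: virtual registers are tagged with the most significant
// bit, which is what the isVirtualRegister/isPhysicalRegister queries test.
// Register number 0 means "no register".
class RegisterSketch {
  uint32_t Reg;

public:
  static constexpr uint32_t VirtualBit = 1u << 31;
  constexpr RegisterSketch(uint32_t R) : Reg(R) {}
  constexpr operator uint32_t() const { return Reg; } // keeps unsigned code working
  static constexpr bool isVirtualRegister(uint32_t R) { return R & VirtualBit; }
  static constexpr bool isPhysicalRegister(uint32_t R) {
    return R != 0 && !(R & VirtualBit);
  }
  static constexpr uint32_t index2VirtReg(uint32_t Index) {
    return Index | VirtualBit;
  }
};

int main() {
  RegisterSketch VReg = RegisterSketch::index2VirtReg(0);
  assert(RegisterSketch::isVirtualRegister(VReg));
  assert(!RegisterSketch::isPhysicalRegister(VReg));
  assert(RegisterSketch::isPhysicalRegister(3)); // small numbers are physical
  return 0;
}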
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 43ae57ac1262..7a66854d32f4 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -200,7 +200,7 @@ private:
const Function *F;
std::string CurrentFnName;
- void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
+ void EmitBasicBlockStart(const MachineBasicBlock &MBB) override;
void EmitFunctionEntryLabel() override;
void EmitFunctionBodyStart() override;
void EmitFunctionBodyEnd() override;
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 46f08b23d31a..d26912f47e50 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -25,7 +25,7 @@
using namespace llvm;
NVPTXFrameLowering::NVPTXFrameLowering()
- : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0) {}
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, Align(8), 0) {}
bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ae1aa98da0e8..9acd0bea66fd 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -480,7 +480,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// Register custom handling for vector loads/stores
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
if (IsPTXVectorType(VT)) {
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
@@ -1291,8 +1291,8 @@ std::string NVPTXTargetLowering::getPrototype(
O << ".param .b" << size << " _";
} else if (isa<PointerType>(retTy)) {
O << ".param .b" << PtrVT.getSizeInBits() << " _";
- } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) {
- auto &DL = CS.getCalledFunction()->getParent()->getDataLayout();
+ } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
+ retTy->isIntegerTy(128)) {
O << ".param .align " << retAlignment << " .b8 _["
<< DL.getTypeAllocSize(retTy) << "]";
} else {
@@ -2230,8 +2230,8 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::v2f16) {
LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT MemVT = Load->getMemoryVT();
- if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- *Load->getMemOperand())) {
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ MemVT, *Load->getMemOperand())) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, SDLoc(Op));
@@ -2273,8 +2273,8 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// v2f16 is legal, so we can't rely on legalizer to handle unaligned
// stores and have to handle it here.
if (VT == MVT::v2f16 &&
- !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- *Store->getMemOperand()))
+ !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ VT, *Store->getMemOperand()))
return expandUnalignedStore(Store, DAG);
if (VT.isVector())
@@ -3497,7 +3497,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
@@ -3521,7 +3521,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 8;
+ Info.align = Align(8);
return true;
}
@@ -3547,7 +3547,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3585,7 +3585,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 4;
+ Info.align = Align(4);
return true;
}
@@ -3606,7 +3606,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3627,7 +3627,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3648,7 +3648,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3665,7 +3665,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 8;
+ Info.align = Align(8);
return true;
}
@@ -3686,7 +3686,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOStore;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3707,7 +3707,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOStore;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3728,7 +3728,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOStore;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3745,7 +3745,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOStore;
- Info.align = 8;
+ Info.align = Align(8);
return true;
}
@@ -3780,7 +3780,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- Info.align = 0;
+ Info.align.reset();
return true;
}
@@ -3798,7 +3798,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ Info.align =
+ MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
return true;
}
@@ -3817,7 +3818,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ Info.align =
+ MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
return true;
}
@@ -3883,7 +3885,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_tex_1d_v4s32_s32:
@@ -4003,7 +4005,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_suld_1d_i8_clamp:
@@ -4056,7 +4058,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_suld_1d_i16_clamp:
@@ -4109,7 +4111,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_suld_1d_i32_clamp:
@@ -4162,7 +4164,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_suld_1d_i64_clamp:
@@ -4200,7 +4202,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
return false;
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 62da3c79f465..fe7a84f9a361 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -143,12 +143,17 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
+def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
+// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
+def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
+ "&& Subtarget->getPTXVersion() >= 64)">;
+
def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
@@ -2908,7 +2913,7 @@ def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
-def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
@@ -2925,10 +2930,10 @@ def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
// and then ctlz that value. This way we don't have to subtract 16 from the
// result. Unfortunately today we don't have a way to generate
// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
-def : Pat<(ctlz Int16Regs:$a),
+def : Pat<(i16 (ctlz Int16Regs:$a)),
(SUBi16ri (CVT_u16_u32
(CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
-def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
(SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
// Population count
@@ -2953,7 +2958,7 @@ def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
// If we know that we're storing into an i32, we can avoid the final trunc.
def : Pat<(ctpop Int16Regs:$a),
(CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
-def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
+def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
(POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
// fpround f32 -> f16
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 1752d3e0575e..c52195fb0449 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -56,6 +56,10 @@ class RegSeq<int n, string prefix> {
[]);
}
+class THREADMASK_INFO<bit sync> {
+ list<bit> ret = !if(sync, [0,1], [0]);
+}
+
//-----------------------------------
// Synchronization and shuffle functions
//-----------------------------------
@@ -129,121 +133,64 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
Requires<[hasPTX60, hasSM30]>;
-
-// shfl.{up,down,bfly,idx}.b32
-multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
- // The last two parameters to shfl can be regs or imms. ptxas is smart
- // enough to inline constant registers, so strictly speaking we don't need to
- // handle immediates here. But it's easy enough, and it makes our ptx more
- // readable.
- def reg : NVPTXInst<
- (outs regclass:$dst),
- (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
- !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
- [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>;
-
- def imm1 : NVPTXInst<
- (outs regclass:$dst),
- (ins regclass:$src, i32imm:$offset, Int32Regs:$mask),
- !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
- [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>;
-
- def imm2 : NVPTXInst<
- (outs regclass:$dst),
- (ins regclass:$src, Int32Regs:$offset, i32imm:$mask),
- !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
- [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>;
-
- def imm3 : NVPTXInst<
- (outs regclass:$dst),
- (ins regclass:$src, i32imm:$offset, i32imm:$mask),
- !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
- [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>;
+class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
+ bit offset_imm, bit mask_imm, bit threadmask_imm>
+ : NVPTXInst<(outs), (ins), "?", []> {
+ NVPTXRegClass rc = !cond(
+ !eq(reg, "i32"): Int32Regs,
+ !eq(reg, "f32"): Float32Regs);
+ string IntrName = "int_nvvm_shfl_"
+ # !if(sync, "sync_", "")
+ # mode
+ # "_" # reg
+ # !if(return_pred, "p", "");
+ Intrinsic Intr = !cast<Intrinsic>(IntrName);
+ let InOperandList = !con(
+ !if(sync,
+ !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]),
+ (ins)),
+ (ins rc:$src),
+ !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]),
+ !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"])
+ );
+ let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst));
+ let AsmString = "shfl."
+ # !if(sync, "sync.", "")
+ # mode # ".b32\t"
+ # "$dst"
+ # !if(return_pred, "|$pred", "") # ", "
+ # "$src, $offset, $mask"
+ # !if(sync, ", $threadmask", "")
+ # ";"
+ ;
+ let Pattern = [!con(
+ !foreach(tmp, OutOperandList,
+ !subst(outs, set,
+ !subst(i32imm, imm, tmp))),
+ (set !foreach(tmp, InOperandList,
+ !subst(ins, Intr,
+ !subst(i32imm, imm, tmp))))
+ )];
}
-defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>;
-defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>;
-defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>;
-defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>;
-defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>;
-defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
-defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
-defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
-
-multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
- // Threadmask and the last two parameters to shfl.sync can be regs or imms.
- // ptxas is smart enough to inline constant registers, so strictly speaking we
- // don't need to handle immediates here. But it's easy enough, and it makes
- // our ptx more readable.
- def rrr : NVPTXInst<
- (outs regclass:$dst),
- (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
- Int32Regs:$offset, Int32Regs:$mask))]>;
-
- def rri : NVPTXInst<
- (outs regclass:$dst),
- (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
- Int32Regs:$offset, imm:$mask))]>;
-
- def rir : NVPTXInst<
- (outs regclass:$dst),
- (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
- imm:$offset, Int32Regs:$mask))]>;
-
- def rii : NVPTXInst<
- (outs regclass:$dst),
- (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
- imm:$offset, imm:$mask))]>;
-
- def irr : NVPTXInst<
- (outs regclass:$dst),
- (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
- Int32Regs:$offset, Int32Regs:$mask))]>;
-
- def iri : NVPTXInst<
- (outs regclass:$dst),
- (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
- Int32Regs:$offset, imm:$mask))]>;
-
- def iir : NVPTXInst<
- (outs regclass:$dst),
- (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
- imm:$offset, Int32Regs:$mask))]>;
-
- def iii : NVPTXInst<
- (outs regclass:$dst),
- (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
- imm:$offset, imm:$mask))]>;
+foreach sync = [0, 1] in {
+ foreach mode = ["up", "down", "bfly", "idx"] in {
+ foreach regclass = ["i32", "f32"] in {
+ foreach return_pred = [0, 1] in {
+ foreach offset_imm = [0, 1] in {
+ foreach mask_imm = [0, 1] in {
+ foreach threadmask_imm = THREADMASK_INFO<sync>.ret in {
+ def : SHFL_INSTR<sync, mode, regclass, return_pred,
+ offset_imm, mask_imm, threadmask_imm>,
+ Requires<!if(sync, [hasSM30], [hasSM30, hasSHFL])>;
+ }
+ }
+ }
+ }
+ }
+ }
}
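
The nested foreach above replaces the two hand-written multiclasses with one parameterized SHFL_INSTR record per combination, so every sync/non-sync mode, register class, predicate-return flavour, and reg/imm operand mix is stamped out automatically. The C++ loop below only illustrates how many definitions that expansion produces; it is not part of the build:

#include <iostream>

int main() {
  const unsigned NumModes = 4;      // up, down, bfly, idx
  const unsigned NumRegClasses = 2; // i32, f32
  unsigned Count = 0;
  for (unsigned Sync = 0; Sync <= 1; ++Sync)
    for (unsigned Mode = 0; Mode < NumModes; ++Mode)
      for (unsigned RC = 0; RC < NumRegClasses; ++RC)
        for (unsigned RetPred = 0; RetPred <= 1; ++RetPred)
          for (unsigned OffImm = 0; OffImm <= 1; ++OffImm)
            for (unsigned MaskImm = 0; MaskImm <= 1; ++MaskImm)
              // THREADMASK_INFO<sync>: only shfl.sync variants carry a
              // threadmask operand, so only they get an imm form for it.
              for (unsigned TMaskImm = 0; TMaskImm <= Sync; ++TMaskImm)
                ++Count;
  // 64 non-sync definitions + 128 sync definitions = 192 anonymous records.
  std::cout << "shfl definitions generated: " << Count << '\n';
  return 0;
}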
-// On sm_70 these don't have to be convergent, so we may eventually want to
-// implement non-convergent variant of this intrinsic.
-defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>;
-defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>;
-defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>;
-defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>;
-defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>;
-defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>;
-defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>;
-defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>;
-
-
// vote.{all,any,uni,ballot}
multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred),
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 0743a2986718..83039241a7c7 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -103,7 +103,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
// Do the transformation of an aggr load/copy/set to a loop
//
for (LoadInst *LI : AggrLoads) {
- StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin());
+ auto *SI = cast<StoreInst>(*LI->user_begin());
Value *SrcAddr = LI->getOperand(0);
Value *DstAddr = SI->getOperand(1);
unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
index 76fb9f3fa692..945b7286b03c 100644
--- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -41,12 +41,12 @@ void initializeNVPTXLowerAllocaPass(PassRegistry &);
}
namespace {
-class NVPTXLowerAlloca : public BasicBlockPass {
- bool runOnBasicBlock(BasicBlock &BB) override;
+class NVPTXLowerAlloca : public FunctionPass {
+ bool runOnFunction(Function &F) override;
public:
static char ID; // Pass identification, replacement for typeid
- NVPTXLowerAlloca() : BasicBlockPass(ID) {}
+ NVPTXLowerAlloca() : FunctionPass(ID) {}
StringRef getPassName() const override {
return "convert address space of alloca'ed memory to local";
}
@@ -61,58 +61,61 @@ INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca",
// =============================================================================
// Main function for this pass.
// =============================================================================
-bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) {
- if (skipBasicBlock(BB))
+bool NVPTXLowerAlloca::runOnFunction(Function &F) {
+ if (skipFunction(F))
return false;
bool Changed = false;
- for (auto &I : BB) {
- if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
- Changed = true;
- auto PTy = dyn_cast<PointerType>(allocaInst->getType());
- auto ETy = PTy->getElementType();
- auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
- auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
- auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
- auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal,
- GenericAddrTy, "");
- NewASCToLocal->insertAfter(allocaInst);
- NewASCToGeneric->insertAfter(NewASCToLocal);
- for (Value::use_iterator UI = allocaInst->use_begin(),
- UE = allocaInst->use_end();
- UI != UE; ) {
- // Check Load, Store, GEP, and BitCast Uses on alloca and make them
- // use the converted generic address, in order to expose non-generic
- // addrspacecast to NVPTXInferAddressSpaces. For other types
- // of instructions this is unnecessary and may introduce redundant
- // address cast.
- const auto &AllocaUse = *UI++;
- auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
- if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) {
- LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric);
- continue;
- }
- auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
- if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) {
- SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
- continue;
- }
- auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
- if (GI && GI->getPointerOperand() == allocaInst) {
- GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric);
- continue;
- }
- auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser());
- if (BI && BI->getOperand(0) == allocaInst) {
- BI->setOperand(0, NewASCToGeneric);
- continue;
+ for (auto &BB : F)
+ for (auto &I : BB) {
+ if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
+ Changed = true;
+ auto PTy = dyn_cast<PointerType>(allocaInst->getType());
+ auto ETy = PTy->getElementType();
+ auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
+ auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
+ auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
+ auto NewASCToGeneric =
+ new AddrSpaceCastInst(NewASCToLocal, GenericAddrTy, "");
+ NewASCToLocal->insertAfter(allocaInst);
+ NewASCToGeneric->insertAfter(NewASCToLocal);
+ for (Value::use_iterator UI = allocaInst->use_begin(),
+ UE = allocaInst->use_end();
+ UI != UE;) {
+ // Check Load, Store, GEP, and BitCast Uses on alloca and make them
+ // use the converted generic address, in order to expose non-generic
+ // addrspacecast to NVPTXInferAddressSpaces. For other types
+ // of instructions this is unnecessary and may introduce redundant
+ // address cast.
+ const auto &AllocaUse = *UI++;
+ auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
+ if (LI && LI->getPointerOperand() == allocaInst &&
+ !LI->isVolatile()) {
+ LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric);
+ continue;
+ }
+ auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
+ if (SI && SI->getPointerOperand() == allocaInst &&
+ !SI->isVolatile()) {
+ SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
+ continue;
+ }
+ auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
+ if (GI && GI->getPointerOperand() == allocaInst) {
+ GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric);
+ continue;
+ }
+ auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser());
+ if (BI && BI->getOperand(0) == allocaInst) {
+ BI->setOperand(0, NewASCToGeneric);
+ continue;
+ }
}
}
}
- }
return Changed;
}
-BasicBlockPass *llvm::createNVPTXLowerAllocaPass() {
+FunctionPass *llvm::createNVPTXLowerAllocaPass() {
return new NVPTXLowerAlloca();
}
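
BasicBlockPass is gone from the legacy pass manager, so NVPTXLowerAlloca becomes a FunctionPass that walks the blocks itself and checks skipFunction() instead of skipBasicBlock(). For reference, a minimal legacy-PM FunctionPass of that shape looks roughly like the skeleton below; this is a generic sketch, not the NVPTX pass, and it needs the LLVM headers to build:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"

using namespace llvm;

namespace {
// Generic legacy-pass-manager skeleton: iterate every block of the function
// inside runOnFunction, which is where per-basic-block passes migrate to.
class LowerSomethingSketch : public FunctionPass {
public:
  static char ID;
  LowerSomethingSketch() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    if (skipFunction(F)) // honours optnone/opt-bisect, as skipBasicBlock did
      return false;
    bool Changed = false;
    for (BasicBlock &BB : F)
      for (Instruction &I : BB)
        (void)I; // per-instruction rewriting would go here
    return Changed;
  }
};
} // end anonymous namespace

char LowerSomethingSketch::ID = 0;

FunctionPass *createLowerSomethingSketchPass() {
  return new LowerSomethingSketch();
}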
diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index c5e02e34e25e..c3c5f6fbcba7 100644
--- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -164,7 +164,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
// Set the alignment to alignment of the byval parameter. This is because,
// later load/stores assume that alignment, and we are going to replace
// the use of the byval parameter with this alloca instruction.
- AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo()));
+ AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo())));
Arg->replaceAllUsesWith(AllocA);
Value *ArgInParam = new AddrSpaceCastInst(
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
index 629757db8707..5e6411c61eab 100644
--- a/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -81,7 +81,7 @@ static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
auto &Op = Root.getOperand(1);
const auto &MRI = MF.getRegInfo();
MachineInstr *GenericAddrDef = nullptr;
- if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+ if (Op.isReg() && Register::isVirtualRegister(Op.getReg())) {
GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
}
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 4c5a9adf1f65..a7127b0e9a99 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -178,7 +178,7 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// frame index registers. Functions which don't want/need this optimization
// will continue to use the existing code path.
if (MFI.getUseLocalStackAllocationBlock()) {
- unsigned Align = MFI.getLocalFrameMaxAlign();
+ unsigned Align = MFI.getLocalFrameMaxAlign().value();
// Adjust to alignment boundary.
Offset = (Offset + Align - 1) / Align * Align;
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 11b3fe2fa3d3..f58fb5717773 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -116,7 +116,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
CPU, FS, Options, Reloc::PIC_,
getEffectiveCodeModel(CM, CodeModel::Small), OL),
is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
- TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
+ TLOF(std::make_unique<NVPTXTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
if (TT.getOS() == Triple::NVCL)
drvInterface = NVPTX::NVCL;
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 665eb1383253..43c2e9920403 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -19,10 +19,11 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/Mutex.h"
#include <algorithm>
#include <cstring>
#include <map>
+#include <mutex>
#include <string>
#include <vector>
@@ -38,12 +39,12 @@ static ManagedStatic<per_module_annot_t> annotationCache;
static sys::Mutex Lock;
void clearAnnotationCache(const Module *Mod) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
annotationCache->erase(Mod);
}
static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
assert(md && "Invalid mdnode for annotation");
assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
// start index = 1, to skip the global variable key
@@ -69,7 +70,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
}
static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations");
if (!NMD)
return;
@@ -103,7 +104,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
unsigned &retval) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
const Module *m = gv->getParent();
if ((*annotationCache).find(m) == (*annotationCache).end())
cacheAnnotationFromMD(m, gv);
@@ -117,7 +118,7 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
std::vector<unsigned> &retval) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
const Module *m = gv->getParent();
if ((*annotationCache).find(m) == (*annotationCache).end())
cacheAnnotationFromMD(m, gv);
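
llvm::MutexGuard was a thin RAII lock that std::lock_guard replaces one for one (sys::Mutex provides lock()/unlock(), so only the include and the guard type change at these call sites). The same pattern in plain standard C++, without the LLVM types:

#include <cassert>
#include <map>
#include <mutex>
#include <string>

namespace {
std::mutex CacheLock;
std::map<std::string, unsigned> Cache;
} // namespace

// std::lock_guard locks in its constructor and unlocks in its destructor, so
// every early return below releases CacheLock, exactly as MutexGuard did.
unsigned lookupOrInsert(const std::string &Key, unsigned Default) {
  std::lock_guard<std::mutex> Guard(CacheLock);
  auto It = Cache.find(Key);
  if (It != Cache.end())
    return It->second;
  Cache.emplace(Key, Default);
  return Default;
}

int main() {
  assert(lookupOrInsert("maxntidx", 128) == 128);
  assert(lookupOrInsert("maxntidx", 256) == 128); // cached value wins
  return 0;
}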
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index c9524da93acd..aedf5b713c3f 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -579,7 +579,7 @@ public:
static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S,
bool IsPPC64) {
- auto Op = make_unique<PPCOperand>(Token);
+ auto Op = std::make_unique<PPCOperand>(Token);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -608,7 +608,7 @@ public:
static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E,
bool IsPPC64) {
- auto Op = make_unique<PPCOperand>(Immediate);
+ auto Op = std::make_unique<PPCOperand>(Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -618,7 +618,7 @@ public:
static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S,
SMLoc E, bool IsPPC64) {
- auto Op = make_unique<PPCOperand>(Expression);
+ auto Op = std::make_unique<PPCOperand>(Expression);
Op->Expr.Val = Val;
Op->Expr.CRVal = EvaluateCRExpr(Val);
Op->StartLoc = S;
@@ -629,7 +629,7 @@ public:
static std::unique_ptr<PPCOperand>
CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) {
- auto Op = make_unique<PPCOperand>(TLSRegister);
+ auto Op = std::make_unique<PPCOperand>(TLSRegister);
Op->TLSReg.Sym = Sym;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -639,7 +639,7 @@ public:
static std::unique_ptr<PPCOperand>
CreateContextImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) {
- auto Op = make_unique<PPCOperand>(ContextImmediate);
+ auto Op = std::make_unique<PPCOperand>(ContextImmediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 7a8af57961cb..3597fd15eeb1 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -167,12 +167,6 @@ static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, QFRegs);
}
-static DecodeStatus DecodeSPE4RCRegisterClass(MCInst &Inst, uint64_t RegNo,
- uint64_t Address,
- const void *Decoder) {
- return decodeRegisterClass(Inst, RegNo, RRegs);
-}
-
static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 042ddf48d5df..20f752c3041a 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -78,7 +78,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
// determine the type of the relocation
unsigned Type;
if (IsPCRel) {
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
llvm_unreachable("Unimplemented");
case PPC::fixup_ppc_br24:
@@ -131,7 +131,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
break;
}
} else {
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default: llvm_unreachable("invalid fixup kind!");
case FK_NONE:
Type = ELF::R_PPC_NONE;
@@ -443,5 +443,5 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
std::unique_ptr<MCObjectTargetWriter>
llvm::createPPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) {
- return llvm::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI);
+ return std::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI);
}
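
MCFixup::getTargetKind() returns the fixup kind as a plain unsigned, so the switch no longer needs the (unsigned)Fixup.getKind() cast that was there only to mix generic FK_* values with target enumerators. A small illustration of the shape of that API; names and values here are made up, apart from the idea that target kinds start past the generic ones:

#include <cassert>

// Illustrative only: generic fixup kinds (FK_*) and target-specific ones share
// one unsigned space, with target kinds starting at a FirstTargetFixupKind
// marker. getTargetKind() exposes that unsigned so a switch over target
// enumerators needs no cast.
enum FixupKindSketch : unsigned {
  FK_NONE_Sketch = 0,
  FirstTargetFixupKindSketch = 128
};
enum PPCFixupSketch : unsigned {
  fixup_ppc_br24_sketch = FirstTargetFixupKindSketch
};

struct FixupSketch {
  unsigned Kind;
  unsigned getTargetKind() const { return Kind; }
};

unsigned relocTypeFor(const FixupSketch &F) {
  switch (F.getTargetKind()) { // no (unsigned) cast needed at the call site
  case fixup_ppc_br24_sketch:
    return 1; // stand-in for the PC-relative branch relocation
  default:
    return 0;
  }
}

int main() {
  assert(relocTypeFor({fixup_ppc_br24_sketch}) == 1);
  assert(relocTypeFor({FK_NONE_Sketch}) == 0);
  return 0;
}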
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index 0e64ae55ab1c..7fc231618fa9 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -66,6 +66,31 @@ void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
StringRef Annot, const MCSubtargetInfo &STI) {
+ // Customize printing of the addis instruction on AIX. When an operand is a
+ // symbol reference, the instruction syntax is changed to look like a load
+ // operation, i.e.:
+ // Transform: addis $rD, $rA, $src --> addis $rD, $src($rA).
+ if (TT.isOSAIX() &&
+ (MI->getOpcode() == PPC::ADDIS8 || MI->getOpcode() == PPC::ADDIS) &&
+ MI->getOperand(2).isExpr()) {
+ assert((MI->getOperand(0).isReg() && MI->getOperand(1).isReg()) &&
+ "The first and the second operand of an addis instruction"
+ " should be registers.");
+
+ assert(isa<MCSymbolRefExpr>(MI->getOperand(2).getExpr()) &&
+ "The third operand of an addis instruction should be a symbol "
+ "reference expression if it is an expression at all.");
+
+ O << "\taddis ";
+ printOperand(MI, 0, O);
+ O << ", ";
+ printOperand(MI, 2, O);
+ O << "(";
+ printOperand(MI, 1, O);
+ O << ")";
+ return;
+ }
+
// Check for slwi/srwi mnemonics.
if (MI->getOpcode() == PPC::RLWINM) {
unsigned char SH = MI->getOperand(2).getImm();
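
To make the AIX addis customization above concrete: the rewrite only changes how the operands are laid out in the printed text, so a symbolic third operand reads like a load. A tiny formatting sketch using plain strings, not the MCInst printer:

#include <iostream>
#include <string>

// Mirrors the comment above: addis $rD, $rA, $src becomes
// addis $rD, $src($rA) on AIX when $src is a symbol reference.
// The register and symbol names in main() are placeholders.
std::string printAddis(const std::string &RD, const std::string &RA,
                       const std::string &Src, bool SrcIsSymbol, bool IsAIX) {
  if (IsAIX && SrcIsSymbol)
    return "addis " + RD + ", " + Src + "(" + RA + ")";
  return "addis " + RD + ", " + RA + ", " + Src;
}

int main() {
  std::cout << printAddis("4", "2", "L..C0", true, true) << '\n';
  // -> addis 4, L..C0(2)
  std::cout << printAddis("4", "2", ".LC0@ha", true, false) << '\n';
  // -> addis 4, 2, .LC0@ha
  return 0;
}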
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 5f0005ea1d7b..1216cd727289 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -86,4 +86,5 @@ void PPCXCOFFMCAsmInfo::anchor() {}
PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
assert(!IsLittleEndian && "Little-endian XCOFF not supported.");
CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 8 : 4;
+ ZeroDirective = "\t.space\t";
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index d467f5c4a439..fb9dd5d7aa75 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -19,8 +19,8 @@ using namespace llvm;
const PPCMCExpr*
PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr,
- bool isDarwin, MCContext &Ctx) {
- return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin);
+ bool IsDarwin, MCContext &Ctx) {
+ return new (Ctx) PPCMCExpr(Kind, Expr, IsDarwin);
}
void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index 449e2c34f74d..ad1454566162 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -45,21 +45,21 @@ public:
/// @{
static const PPCMCExpr *create(VariantKind Kind, const MCExpr *Expr,
- bool isDarwin, MCContext &Ctx);
+ bool IsDarwin, MCContext &Ctx);
static const PPCMCExpr *createLo(const MCExpr *Expr,
- bool isDarwin, MCContext &Ctx) {
- return create(VK_PPC_LO, Expr, isDarwin, Ctx);
+ bool IsDarwin, MCContext &Ctx) {
+ return create(VK_PPC_LO, Expr, IsDarwin, Ctx);
}
static const PPCMCExpr *createHi(const MCExpr *Expr,
- bool isDarwin, MCContext &Ctx) {
- return create(VK_PPC_HI, Expr, isDarwin, Ctx);
+ bool IsDarwin, MCContext &Ctx) {
+ return create(VK_PPC_HI, Expr, IsDarwin, Ctx);
}
static const PPCMCExpr *createHa(const MCExpr *Expr,
- bool isDarwin, MCContext &Ctx) {
- return create(VK_PPC_HA, Expr, isDarwin, Ctx);
+ bool IsDarwin, MCContext &Ctx) {
+ return create(VK_PPC_HA, Expr, IsDarwin, Ctx);
}
/// @}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index 4cf7fd15fa75..672f910ab086 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -178,7 +178,7 @@ static uint32_t getFixupOffset(const MCAsmLayout &Layout,
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
// On Mach-O, ppc_fixup_half16 relocations must refer to the
// start of the instruction, not the second halfword, as ELF does
- if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16)
+ if (Fixup.getTargetKind() == PPC::fixup_ppc_half16)
FixupOffset &= ~uint32_t(3);
return FixupOffset;
}
@@ -376,5 +376,5 @@ void PPCMachObjectWriter::RecordPPCRelocation(
std::unique_ptr<MCObjectTargetWriter>
llvm::createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
uint32_t CPUSubtype) {
- return llvm::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
+ return std::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index 9c661286d455..7fdbb8990b55 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -25,5 +25,5 @@ PPCXCOFFObjectWriter::PPCXCOFFObjectWriter(bool Is64Bit)
std::unique_ptr<MCObjectTargetWriter>
llvm::createPPCXCOFFObjectWriter(bool Is64Bit) {
- return llvm::make_unique<PPCXCOFFObjectWriter>(Is64Bit);
+ return std::make_unique<PPCXCOFFObjectWriter>(Is64Bit);
}
diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td
index 2a10322d3f49..f6cd8ed00c82 100644
--- a/lib/Target/PowerPC/P9InstrResources.td
+++ b/lib/Target/PowerPC/P9InstrResources.td
@@ -64,6 +64,7 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
XXLAND,
XXLANDC,
XXLEQV,
+ XXLEQVOnes,
XXLNAND,
XXLNOR,
XXLOR,
@@ -124,8 +125,8 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C],
(instregex "SRAD(I)?$"),
(instregex "EXTSWSLI_32_64$"),
(instregex "MFV(S)?RD$"),
- (instregex "MTVSRD$"),
- (instregex "MTVSRW(A|Z)$"),
+ (instregex "MTV(S)?RD$"),
+ (instregex "MTV(S)?RW(A|Z)$"),
(instregex "CMP(WI|LWI|W|LW)(8)?$"),
(instregex "CMP(L)?D(I)?$"),
(instregex "SUBF(I)?C(8)?$"),
@@ -148,7 +149,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C],
(instregex "EXTS(B|H|W)(8)?(_32)?(_64)?(o)?$"),
(instregex "ADD(4|8)(TLS)?(_)?$"),
(instregex "NEG(8)?$"),
- (instregex "ADDI(S)?toc(HA|L)$"),
+ (instregex "ADDI(S)?toc(HA|L)(8)?$"),
COPY,
MCRF,
MCRXRX,
@@ -158,6 +159,7 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C],
XSNEGDP,
XSCPSGNDP,
MFVSRWZ,
+ MFVRWZ,
EXTSWSLI,
SRADI_32,
RLDIC,
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index c6951ab67b08..0534773c4c9e 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -50,10 +50,10 @@ namespace llvm {
FunctionPass *createPPCExpandISELPass();
FunctionPass *createPPCPreEmitPeepholePass();
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
- AsmPrinter &AP, bool isDarwin);
+ AsmPrinter &AP, bool IsDarwin);
bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
MCOperand &OutMO, AsmPrinter &AP,
- bool isDarwin);
+ bool IsDarwin);
void initializePPCCTRLoopsPass(PassRegistry&);
#ifndef NDEBUG
@@ -86,8 +86,8 @@ namespace llvm {
MO_NO_FLAG,
/// On a symbol operand "FOO", this indicates that the reference is actually
- /// to "FOO@plt". This is used for calls and jumps to external functions on
- /// for PIC calls on Linux and ELF systems.
+ /// to "FOO@plt". This is used for calls and jumps to external functions
+ /// and for PIC calls on 32-bit ELF systems.
MO_PLT = 1,
/// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index bd87ce06b4fb..66236b72a1a3 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -51,9 +51,11 @@
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
@@ -76,7 +78,7 @@ namespace {
class PPCAsmPrinter : public AsmPrinter {
protected:
- MapVector<MCSymbol *, MCSymbol *> TOC;
+ MapVector<const MCSymbol *, MCSymbol *> TOC;
const PPCSubtarget *Subtarget;
StackMaps SM;
@@ -87,7 +89,7 @@ public:
StringRef getPassName() const override { return "PowerPC Assembly Printer"; }
- MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
+ MCSymbol *lookUpOrCreateTOCEntry(const MCSymbol *Sym);
bool doInitialization(Module &M) override {
if (!TOC.empty())
@@ -164,6 +166,14 @@ public:
: PPCAsmPrinter(TM, std::move(Streamer)) {}
StringRef getPassName() const override { return "AIX PPC Assembly Printer"; }
+
+ void SetupMachineFunction(MachineFunction &MF) override;
+
+ void EmitGlobalVariable(const GlobalVariable *GV) override;
+
+ void EmitFunctionDescriptor() override;
+
+ void EmitEndOfAsmFile(Module &) override;
};
} // end anonymous namespace
@@ -265,7 +275,7 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
return true;
// This operand uses VSX numbering.
// If the operand is a VMX register, convert it to a VSX register.
- unsigned Reg = MI->getOperand(OpNo).getReg();
+ Register Reg = MI->getOperand(OpNo).getReg();
if (PPCInstrInfo::isVRRegister(Reg))
Reg = PPC::VSX32 + (Reg - PPC::V0);
else if (PPCInstrInfo::isVFRegister(Reg))
@@ -328,7 +338,7 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
/// exists for it. If not, create one. Then return a symbol that references
/// the TOC entry.
-MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
+MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(const MCSymbol *Sym) {
MCSymbol *&TOCEntry = TOC[Sym];
if (!TOCEntry)
TOCEntry = createTempSymbol("C");
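
A sketch of the TOC bookkeeping that lookUpOrCreateTOCEntry implements: each referenced symbol gets exactly one TOC label, handed out on first use, and entries keep insertion order so the TOC section can be emitted deterministically (the property MapVector provides over a plain map keyed on pointers). The class and label names below are placeholders, not the AsmPrinter API:

#include <cassert>
#include <map>
#include <string>
#include <vector>

class TOCSketch {
  std::map<std::string, std::string> Entry; // symbol -> TOC entry label
  std::vector<std::string> Order;           // insertion order for emission
  unsigned NextTemp = 0;

public:
  const std::string &lookUpOrCreate(const std::string &Sym) {
    auto It = Entry.find(Sym);
    if (It != Entry.end())
      return It->second;          // reuse the label created on first lookup
    Order.push_back(Sym);
    return Entry[Sym] = ".LC" + std::to_string(NextTemp++);
  }
  const std::vector<std::string> &emissionOrder() const { return Order; }
};

int main() {
  TOCSketch TOC;
  assert(TOC.lookUpOrCreate("min1") == ".LC0");
  assert(TOC.lookUpOrCreate("other") == ".LC1");
  assert(TOC.lookUpOrCreate("min1") == ".LC0"); // reused, not duplicated
  assert(TOC.emissionOrder().size() == 2);
  return 0;
}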
@@ -378,7 +388,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
if (CallTarget) {
assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
"High 16 bits of call target should be zero.");
- unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+ Register ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
EncodedBytes = 0;
// Materialize the jump address:
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8)
@@ -502,13 +512,32 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
.addExpr(SymVar));
}
+/// Map a machine operand for a TOC pseudo-machine instruction to its
+/// corresponding MCSymbol.
+static MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO,
+ AsmPrinter &AP) {
+ switch (MO.getType()) {
+ case MachineOperand::MO_GlobalAddress:
+ return AP.getSymbol(MO.getGlobal());
+ case MachineOperand::MO_ConstantPoolIndex:
+ return AP.GetCPISymbol(MO.getIndex());
+ case MachineOperand::MO_JumpTableIndex:
+ return AP.GetJTISymbol(MO.getIndex());
+ case MachineOperand::MO_BlockAddress:
+ return AP.GetBlockAddressSymbol(MO.getBlockAddress());
+ default:
+ llvm_unreachable("Unexpected operand type to get symbol.");
+ }
+}
+
/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
/// the current output stream.
///
void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
- bool isPPC64 = Subtarget->isPPC64();
- bool isDarwin = TM.getTargetTriple().isOSDarwin();
+ const bool IsDarwin = TM.getTargetTriple().isOSDarwin();
+ const bool IsPPC64 = Subtarget->isPPC64();
+ const bool IsAIX = Subtarget->isAIXABI();
const Module *M = MF->getFunction().getParent();
PICLevel::Level PL = M->getPICLevel();
@@ -517,7 +546,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (!MI->isInlineAsm()) {
for (const MachineOperand &MO: MI->operands()) {
if (MO.isReg()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Subtarget->hasSPE()) {
if (PPC::F4RCRegClass.contains(Reg) ||
PPC::F8RCRegClass.contains(Reg) ||
@@ -595,7 +624,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// addis r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@ha
// addi r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@l
// Get the offset from the GOT Base Register to the GOT
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
if (Subtarget->isSecurePlt() && isPositionIndependent() ) {
unsigned PICR = TmpInst.getOperand(0).getReg();
MCSymbol *BaseSymbol = OutContext.getOrCreateSymbol(
@@ -646,43 +675,57 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
case PPC::LWZtoc: {
- // Transform %r3 = LWZtoc @min1, %r2
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ assert(!IsDarwin && "TOC is an ELF/XCOFF construct.");
+
+ // Transform %rN = LWZtoc @op1, %r2
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
- // Change the opcode to LWZ, and the global address operand to be a
- // reference to the GOT entry we will synthesize later.
+ // Change the opcode to LWZ.
TmpInst.setOpcode(PPC::LWZ);
+
const MachineOperand &MO = MI->getOperand(1);
+ assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
+ "Invalid operand for LWZtoc.");
- // Map symbol -> label of TOC entry
- assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
- MCSymbol *MOSymbol = nullptr;
- if (MO.isGlobal())
- MOSymbol = getSymbol(MO.getGlobal());
- else if (MO.isCPI())
- MOSymbol = GetCPISymbol(MO.getIndex());
- else if (MO.isJTI())
- MOSymbol = GetJTISymbol(MO.getIndex());
- else if (MO.isBlockAddress())
- MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
-
- if (PL == PICLevel::SmallPIC) {
+ // Map the operand to its corresponding MCSymbol.
+ const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+
+ // Create a reference to the GOT entry for the symbol. The GOT entry will be
+ // synthesized later.
+ if (PL == PICLevel::SmallPIC && !IsAIX) {
const MCExpr *Exp =
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_GOT,
OutContext);
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
- } else {
- MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
- const MCExpr *Exp =
- MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None,
- OutContext);
- const MCExpr *PB =
- MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
- OutContext);
- Exp = MCBinaryExpr::createSub(Exp, PB, OutContext);
+ // Otherwise, use the TOC. 'TOCEntry' is a label used to reference the
+ // storage allocated in the TOC which contains the address of
+ // 'MOSymbol'. Said TOC entry will be synthesized later.
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None, OutContext);
+
+ // AIX uses the label directly as the lwz displacement operand for
+ // references into the toc section. The displacement value will be generated
+ // relative to the toc-base.
+ if (IsAIX) {
+ assert(
+ TM.getCodeModel() == CodeModel::Small &&
+ "This pseudo should only be selected for 32-bit small code model.");
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
}
+
+ // Create an explicit subtract expression between the local symbol and
+ // '.LTOC' to manifest the toc-relative offset.
+ const MCExpr *PB = MCSymbolRefExpr::create(
+ OutContext.getOrCreateSymbol(Twine(".LTOC")), OutContext);
+ Exp = MCBinaryExpr::createSub(Exp, PB, OutContext);
+ TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
@@ -690,72 +733,121 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::LDtocCPT:
case PPC::LDtocBA:
case PPC::LDtoc: {
+ assert(!IsDarwin && "TOC is an ELF/XCOFF construct");
+
// Transform %x3 = LDtoc @min1, %x2
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
- // Change the opcode to LD, and the global address operand to be a
- // reference to the TOC entry we will synthesize later.
+ // Change the opcode to LD.
TmpInst.setOpcode(PPC::LD);
+
const MachineOperand &MO = MI->getOperand(1);
+ assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
+ "Invalid operand!");
+
+ // Map the machine operand to its corresponding MCSymbol, then map the
+ // global address operand to be a reference to the TOC entry we will
+ // synthesize later.
+ MCSymbol *TOCEntry =
+ lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO, *this));
+
+ const MCSymbolRefExpr::VariantKind VK =
+ IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC;
+ const MCExpr *Exp =
+ MCSymbolRefExpr::create(TOCEntry, VK, OutContext);
+ TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case PPC::ADDIStocHA: {
+ assert((IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large) &&
+ "This pseudo should only be selected for 32-bit large code model on"
+ " AIX.");
+
+ // Transform %rd = ADDIStocHA %rA, @sym(%r2)
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
- // Map symbol -> label of TOC entry
- assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
- MCSymbol *MOSymbol = nullptr;
- if (MO.isGlobal())
- MOSymbol = getSymbol(MO.getGlobal());
- else if (MO.isCPI())
- MOSymbol = GetCPISymbol(MO.getIndex());
- else if (MO.isJTI())
- MOSymbol = GetJTISymbol(MO.getIndex());
- else if (MO.isBlockAddress())
- MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+ // Change the opcode to ADDIS.
+ TmpInst.setOpcode(PPC::ADDIS);
+ const MachineOperand &MO = MI->getOperand(2);
+ assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
+ "Invalid operand for ADDIStocHA.");
+
+ // Map the machine operand to its corresponding MCSymbol.
+ MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+
+ // Always use TOC on AIX. Map the global address operand to be a reference
+ // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to
+ // reference the storage allocated in the TOC which contains the address of
+ // 'MOSymbol'.
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+ const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry,
+ MCSymbolRefExpr::VK_PPC_U,
+ OutContext);
+ TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ case PPC::LWZtocL: {
+ assert(IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large &&
+ "This pseudo should only be selected for 32-bit large code model on"
+ " AIX.");
- const MCExpr *Exp =
- MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
- OutContext);
+ // Transform %rd = LWZtocL @sym, %rs.
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+
+ // Change the opcode to lwz.
+ TmpInst.setOpcode(PPC::LWZ);
+
+ const MachineOperand &MO = MI->getOperand(1);
+ assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
+ "Invalid operand for LWZtocL.");
+
+ // Map the machine operand to its corresponding MCSymbol.
+ MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+
+ // Always use TOC on AIX. Map the global address operand to be a reference
+ // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to
+ // reference the storage allocated in the TOC which contains the address of
+ // 'MOSymbol'.
+ MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+ const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry,
+ MCSymbolRefExpr::VK_PPC_L,
+ OutContext);
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
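
   The cases above all funnel their symbol through lookUpOrCreateTOCEntry, whose
   body is not part of this diff. Conceptually it memoizes one temporary label
   per referenced symbol in the printer's TOC map (the same MapVector the ELF
   doFinalization loop further down walks to emit the entries). A minimal
   sketch under that assumption:

   // Minimal sketch of the memoization the TOC pseudos rely on; the real
   // member lives elsewhere in PPCAsmPrinter.cpp and may differ in detail.
   static MCSymbol *lookUpOrCreateTOCEntrySketch(
       MapVector<const MCSymbol *, MCSymbol *> &TOC, MCContext &Ctx,
       const MCSymbol *Sym) {
     MCSymbol *&Entry = TOC[Sym];      // one slot per distinct target symbol
     if (!Entry)
       Entry = Ctx.createTempSymbol(); // label naming the TOC slot; the slot
                                       // itself is emitted in doFinalization
     return Entry;
   }
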
+ case PPC::ADDIStocHA8: {
+ assert(!IsDarwin && "TOC is an ELF/XCOFF construct");
- case PPC::ADDIStocHA: {
- // Transform %xd = ADDIStocHA %x2, @sym
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ // Transform %xd = ADDIStocHA8 %x2, @sym
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
- // Change the opcode to ADDIS8. If the global address is external, has
- // common linkage, is a non-local function address, or is a jump table
- // address, then generate a TOC entry and reference that. Otherwise
- // reference the symbol directly.
+ // Change the opcode to ADDIS8. If the global address is the address of
+ // an external symbol, is a jump table address, is a block address, or is a
+ // constant pool index with large code model enabled, then generate a TOC
+ // entry and reference that. Otherwise, reference the symbol directly.
TmpInst.setOpcode(PPC::ADDIS8);
+
const MachineOperand &MO = MI->getOperand(2);
- assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
- MO.isBlockAddress()) &&
- "Invalid operand for ADDIStocHA!");
- MCSymbol *MOSymbol = nullptr;
- bool GlobalToc = false;
-
- if (MO.isGlobal()) {
- const GlobalValue *GV = MO.getGlobal();
- MOSymbol = getSymbol(GV);
- unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
- GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG);
- } else if (MO.isCPI()) {
- MOSymbol = GetCPISymbol(MO.getIndex());
- } else if (MO.isJTI()) {
- MOSymbol = GetJTISymbol(MO.getIndex());
- } else if (MO.isBlockAddress()) {
- MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
- }
+ assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
+ "Invalid operand for ADDIStocHA8!");
+
+ const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+ const bool GlobalToc =
+ MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal());
if (GlobalToc || MO.isJTI() || MO.isBlockAddress() ||
- TM.getCodeModel() == CodeModel::Large)
+ (MO.isCPI() && TM.getCodeModel() == CodeModel::Large))
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+ const MCSymbolRefExpr::VariantKind VK =
+ IsAIX ? MCSymbolRefExpr::VK_PPC_U : MCSymbolRefExpr::VK_PPC_TOC_HA;
+
const MCExpr *Exp =
- MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
- OutContext);
+ MCSymbolRefExpr::create(MOSymbol, VK, OutContext);
if (!MO.isJTI() && MO.getOffset())
Exp = MCBinaryExpr::createAdd(Exp,
@@ -768,73 +860,59 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::LDtocL: {
+ assert(!IsDarwin && "TOC is an ELF/XCOFF construct");
+
// Transform %xd = LDtocL @sym, %xs
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
- // Change the opcode to LD. If the global address is external, has
- // common linkage, or is a jump table address, then reference the
- // associated TOC entry. Otherwise reference the symbol directly.
+ // Change the opcode to LD. If the global address is the address of
+ // an external symbol, is a jump table address, is a block address, or is
+ // a constant pool index with large code model enabled, then generate a
+ // TOC entry and reference that. Otherwise, reference the symbol directly.
TmpInst.setOpcode(PPC::LD);
+
const MachineOperand &MO = MI->getOperand(1);
assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
MO.isBlockAddress()) &&
"Invalid operand for LDtocL!");
- MCSymbol *MOSymbol = nullptr;
- if (MO.isJTI())
- MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
- else if (MO.isBlockAddress()) {
- MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
- MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
- }
- else if (MO.isCPI()) {
- MOSymbol = GetCPISymbol(MO.getIndex());
- if (TM.getCodeModel() == CodeModel::Large)
- MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
- }
- else if (MO.isGlobal()) {
- const GlobalValue *GV = MO.getGlobal();
- MOSymbol = getSymbol(GV);
- LLVM_DEBUG(
- unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
- assert((GVFlags & PPCII::MO_NLP_FLAG) &&
- "LDtocL used on symbol that could be accessed directly is "
- "invalid. Must match ADDIStocHA."));
+ LLVM_DEBUG(assert(
+ (!MO.isGlobal() || Subtarget->isGVIndirectSymbol(MO.getGlobal())) &&
+ "LDtocL used on symbol that could be accessed directly is "
+ "invalid. Must match ADDIStocHA8."));
+
+ const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+
+ if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large)
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
- }
+ const MCSymbolRefExpr::VariantKind VK =
+ IsAIX ? MCSymbolRefExpr::VK_PPC_L : MCSymbolRefExpr::VK_PPC_TOC_LO;
const MCExpr *Exp =
- MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
- OutContext);
+ MCSymbolRefExpr::create(MOSymbol, VK, OutContext);
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
case PPC::ADDItocL: {
// Transform %xd = ADDItocL %xs, @sym
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
- // Change the opcode to ADDI8. If the global address is external, then
- // generate a TOC entry and reference that. Otherwise reference the
+ // Change the opcode to ADDI8. If the global address is external, then
+ // generate a TOC entry and reference that. Otherwise, reference the
// symbol directly.
TmpInst.setOpcode(PPC::ADDI8);
+
const MachineOperand &MO = MI->getOperand(2);
- assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
- MCSymbol *MOSymbol = nullptr;
-
- if (MO.isGlobal()) {
- const GlobalValue *GV = MO.getGlobal();
- LLVM_DEBUG(unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
- assert(!(GVFlags & PPCII::MO_NLP_FLAG) &&
- "Interposable definitions must use indirect access."));
- MOSymbol = getSymbol(GV);
- } else if (MO.isCPI()) {
- MOSymbol = GetCPISymbol(MO.getIndex());
- }
+ assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL.");
+
+ LLVM_DEBUG(assert(
+ !(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) &&
+ "Interposable definitions must use indirect access."));
const MCExpr *Exp =
- MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
- OutContext);
+ MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO, *this),
+ MCSymbolRefExpr::VK_PPC_TOC_LO, OutContext);
TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
    return;
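
   A pattern worth noting across the high/low TOC pseudos above: the relocation
   variant differs per object format. The ELF path keeps the TOC-relative
   @toc@ha / @toc@l variants, while the AIX (XCOFF) path uses the generic
   upper/lower 16-bit variants and lets the displacement be toc-base relative.
   ADDItocL is 64-bit ELF only, so it keeps @toc@l unconditionally. A small
   illustrative helper (the cases above inline this choice directly; the name
   is not part of the patch):

   static MCSymbolRefExpr::VariantKind tocVariantKind(bool IsAIX, bool IsHigh) {
     if (IsAIX)
       return IsHigh ? MCSymbolRefExpr::VK_PPC_U   // upper 16 bits
                     : MCSymbolRefExpr::VK_PPC_L;  // lower 16 bits
     return IsHigh ? MCSymbolRefExpr::VK_PPC_TOC_HA   // @toc@ha
                   : MCSymbolRefExpr::VK_PPC_TOC_LO;  // @toc@l
   }
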
@@ -842,13 +920,13 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::ADDISgotTprelHA: {
// Transform: %xd = ADDISgotTprelHA %x2, @sym
    // Into:      %xd = ADDIS8 %x2, sym@got@tprel@ha
- assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
+ assert(IsPPC64 && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymGotTprel =
- MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
- OutContext);
+ MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
+ OutContext);
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
@@ -858,16 +936,17 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::LDgotTprelL:
case PPC::LDgotTprelL32: {
// Transform %xd = LDgotTprelL @sym, %xs
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
// Change the opcode to LD.
- TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ);
+ TmpInst.setOpcode(IsPPC64 ? PPC::LD : PPC::LWZ);
const MachineOperand &MO = MI->getOperand(1);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
- const MCExpr *Exp =
- MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
- OutContext);
+ const MCExpr *Exp = MCSymbolRefExpr::create(
+ MOSymbol, IsPPC64 ? MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO
+ : MCSymbolRefExpr::VK_PPC_GOT_TPREL,
+ OutContext);
TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
return;
@@ -920,7 +999,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::ADDIStlsgdHA: {
// Transform: %xd = ADDIStlsgdHA %x2, @sym
// Into: %xd = ADDIS8 %x2, sym@got@tlsgd@ha
- assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
+ assert(IsPPC64 && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
@@ -943,11 +1022,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymGotTlsGD = MCSymbolRefExpr::create(
- MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO
- : MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
+ MOSymbol, IsPPC64 ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO
+ : MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
OutContext);
EmitToStreamer(*OutStreamer,
- MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+ MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(SymGotTlsGD));
@@ -965,7 +1044,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::ADDIStlsldHA: {
// Transform: %xd = ADDIStlsldHA %x2, @sym
// Into: %xd = ADDIS8 %x2, sym@got@tlsld@ha
- assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
+ assert(IsPPC64 && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
@@ -988,11 +1067,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const GlobalValue *GValue = MO.getGlobal();
MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymGotTlsLD = MCSymbolRefExpr::create(
- MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO
- : MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
+ MOSymbol, IsPPC64 ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO
+ : MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
OutContext);
EmitToStreamer(*OutStreamer,
- MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+ MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(SymGotTlsLD));
@@ -1021,7 +1100,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutContext);
EmitToStreamer(
*OutStreamer,
- MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDIS8 : PPC::ADDIS)
+ MCInstBuilder(IsPPC64 ? PPC::ADDIS8 : PPC::ADDIS)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(SymDtprel));
@@ -1040,7 +1119,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
OutContext);
EmitToStreamer(*OutStreamer,
- MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+ MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(SymDtprel));
@@ -1087,7 +1166,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// suite shows a handful of test cases that fail this check for
// Darwin. Those need to be investigated before this sanity test
// can be enabled for those subtargets.
- if (!Subtarget->isDarwin()) {
+ if (!IsDarwin) {
unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
const MachineOperand &MO = MI->getOperand(OpNum);
if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
@@ -1098,7 +1177,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
EmitToStreamer(*OutStreamer, TmpInst);
}
@@ -1368,15 +1447,16 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
OutStreamer->SwitchSection(Section);
- for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
- E = TOC.end(); I != E; ++I) {
- OutStreamer->EmitLabel(I->second);
- MCSymbol *S = I->first;
+ for (const auto &TOCMapPair : TOC) {
+ const MCSymbol *const TOCEntryTarget = TOCMapPair.first;
+ MCSymbol *const TOCEntryLabel = TOCMapPair.second;
+
+ OutStreamer->EmitLabel(TOCEntryLabel);
if (isPPC64) {
- TS.emitTCEntry(*S);
+ TS.emitTCEntry(*TOCEntryTarget);
} else {
OutStreamer->EmitValueToAlignment(4);
- OutStreamer->EmitSymbolValue(S, 4);
+ OutStreamer->EmitSymbolValue(TOCEntryTarget, 4);
}
}
}
@@ -1602,7 +1682,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
- EmitAlignment(isPPC64 ? 3 : 2);
+ EmitAlignment(isPPC64 ? Align(8) : Align(4));
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
// L_foo$stub:
@@ -1643,6 +1723,106 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
return AsmPrinter::doFinalization(M);
}
+void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) {
+ // Get the function descriptor symbol.
+ CurrentFnDescSym = getSymbol(&MF.getFunction());
+ // Set the containing csect.
+ MCSectionXCOFF *FnDescSec = OutStreamer->getContext().getXCOFFSection(
+ CurrentFnDescSym->getName(), XCOFF::XMC_DS, XCOFF::XTY_SD,
+ XCOFF::C_HIDEXT, SectionKind::getData());
+ cast<MCSymbolXCOFF>(CurrentFnDescSym)->setContainingCsect(FnDescSec);
+
+ return AsmPrinter::SetupMachineFunction(MF);
+}
+
+void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+ // Early error checking limiting what is supported.
+ if (GV->isThreadLocal())
+ report_fatal_error("Thread local not yet supported on AIX.");
+
+ if (GV->hasSection())
+ report_fatal_error("Custom section for Data not yet supported.");
+
+ if (GV->hasComdat())
+ report_fatal_error("COMDAT not yet supported by AIX.");
+
+ SectionKind GVKind = getObjFileLowering().getKindForGlobal(GV, TM);
+ if (!GVKind.isCommon() && !GVKind.isBSSLocal() && !GVKind.isData())
+ report_fatal_error("Encountered a global variable kind that is "
+ "not supported yet.");
+
+ // Create the containing csect and switch to it.
+ MCSectionXCOFF *CSect = cast<MCSectionXCOFF>(
+ getObjFileLowering().SectionForGlobal(GV, GVKind, TM));
+ OutStreamer->SwitchSection(CSect);
+
+ // Create the symbol, set its storage class, and emit it.
+ MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV));
+ GVSym->setStorageClass(
+ TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV));
+ GVSym->setContainingCsect(CSect);
+
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+
+ // Handle common symbols.
+ if (GVKind.isCommon() || GVKind.isBSSLocal()) {
+ unsigned Align =
+ GV->getAlignment() ? GV->getAlignment() : DL.getPreferredAlignment(GV);
+ uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
+
+ if (GVKind.isBSSLocal())
+ OutStreamer->EmitXCOFFLocalCommonSymbol(GVSym, Size, Align);
+ else
+ OutStreamer->EmitCommonSymbol(GVSym, Size, Align);
+ return;
+ }
+
+ MCSymbol *EmittedInitSym = GVSym;
+ EmitLinkage(GV, EmittedInitSym);
+ EmitAlignment(getGVAlignment(GV, DL), GV);
+ OutStreamer->EmitLabel(EmittedInitSym);
+ EmitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
+}
+
+void PPCAIXAsmPrinter::EmitFunctionDescriptor() {
+ const DataLayout &DL = getDataLayout();
+ const unsigned PointerSize = DL.getPointerSizeInBits() == 64 ? 8 : 4;
+
+ MCSectionSubPair Current = OutStreamer->getCurrentSection();
+ // Emit function descriptor.
+ OutStreamer->SwitchSection(
+ cast<MCSymbolXCOFF>(CurrentFnDescSym)->getContainingCsect());
+ OutStreamer->EmitLabel(CurrentFnDescSym);
+ // Emit function entry point address.
+ OutStreamer->EmitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext),
+ PointerSize);
+ // Emit TOC base address.
+ MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef("TOC[TC0]"));
+ OutStreamer->EmitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext),
+ PointerSize);
+ // Emit a null environment pointer.
+ OutStreamer->EmitIntValue(0, PointerSize);
+
+ OutStreamer->SwitchSection(Current.first, Current.second);
+}
+
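
   EmitFunctionDescriptor above emits the traditional AIX three-word function
   descriptor. As a conceptual sketch only (not a type used by the patch), the
   object laid out at CurrentFnDescSym is:

   // Conceptual layout only (assumes <cstdint>); each field occupies
   // PointerSize bytes, i.e. 4 on 32-bit and 8 on 64-bit AIX.
   struct FunctionDescriptorSketch {
     uintptr_t EntryPoint;  // address of the function's code (CurrentFnSym)
     uintptr_t TOCBase;     // the module's TOC anchor, symbol "TOC[TC0]"
     uintptr_t Environment; // environment pointer; emitted as 0 here
   };
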
+void PPCAIXAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ // If there are no functions in this module, we will never need to reference
+ // the TOC base.
+ if (M.empty())
+ return;
+
+ // Emit TOC base.
+ MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef("TOC[TC0]"));
+ MCSectionXCOFF *TOCBaseSection = OutStreamer->getContext().getXCOFFSection(
+ StringRef("TOC"), XCOFF::XMC_TC0, XCOFF::XTY_SD, XCOFF::C_HIDEXT,
+ SectionKind::getData());
+ cast<MCSymbolXCOFF>(TOCBaseSym)->setContainingCsect(TOCBaseSection);
+ // Switch to section to emit TOC base.
+ OutStreamer->SwitchSection(TOCBaseSection);
+}
+
+
/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
/// for a MachineFunction to the given output stream, in a format that the
/// Darwin assembler can deal with.
diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index 5e9a661f8f0b..d325b078979f 100644
--- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -340,9 +340,10 @@ bool PPCBranchCoalescing::identicalOperands(
if (Op1.isIdenticalTo(Op2)) {
// filter out instructions with physical-register uses
- if (Op1.isReg() && TargetRegisterInfo::isPhysicalRegister(Op1.getReg())
- // If the physical register is constant then we can assume the value
- // has not changed between uses.
+ if (Op1.isReg() &&
+ Register::isPhysicalRegister(Op1.getReg())
+ // If the physical register is constant then we can assume the value
+ // has not changed between uses.
&& !(Op1.isUse() && MRI->isConstantPhysReg(Op1.getReg()))) {
LLVM_DEBUG(dbgs() << "The operands are not provably identical.\n");
return false;
@@ -355,8 +356,8 @@ bool PPCBranchCoalescing::identicalOperands(
// definition of the register produces the same value. If they produce the
// same value, consider them to be identical.
if (Op1.isReg() && Op2.isReg() &&
- TargetRegisterInfo::isVirtualRegister(Op1.getReg()) &&
- TargetRegisterInfo::isVirtualRegister(Op2.getReg())) {
+ Register::isVirtualRegister(Op1.getReg()) &&
+ Register::isVirtualRegister(Op2.getReg())) {
MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg());
MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg());
if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) {
@@ -456,7 +457,7 @@ bool PPCBranchCoalescing::canMoveToEnd(const MachineInstr &MI,
<< TargetMBB.getNumber() << "\n");
for (auto &Use : MI.uses()) {
- if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) {
+ if (Use.isReg() && Register::isVirtualRegister(Use.getReg())) {
MachineInstr *DefInst = MRI->getVRegDef(Use.getReg());
if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) {
LLVM_DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index 793d690baec3..cdff4d383d23 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -81,21 +81,20 @@ FunctionPass *llvm::createPPCBranchSelectionPass() {
/// original Offset.
unsigned PPCBSel::GetAlignmentAdjustment(MachineBasicBlock &MBB,
unsigned Offset) {
- unsigned Align = MBB.getAlignment();
- if (!Align)
+ const Align Alignment = MBB.getAlignment();
+ if (Alignment == Align::None())
return 0;
- unsigned AlignAmt = 1 << Align;
- unsigned ParentAlign = MBB.getParent()->getAlignment();
+ const Align ParentAlign = MBB.getParent()->getAlignment();
- if (Align <= ParentAlign)
- return OffsetToAlignment(Offset, AlignAmt);
+ if (Alignment <= ParentAlign)
+ return offsetToAlignment(Offset, Alignment);
// The alignment of this MBB is larger than the function's alignment, so we
// can't tell whether or not it will insert nops. Assume that it will.
if (FirstImpreciseBlock < 0)
FirstImpreciseBlock = MBB.getNumber();
- return AlignAmt + OffsetToAlignment(Offset, AlignAmt);
+ return Alignment.value() + offsetToAlignment(Offset, Alignment);
}
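
 The GetAlignmentAdjustment rewrite above replaces log2-encoded alignments with
 the Align type but keeps the arithmetic: pad to the next boundary when the
 block's alignment is no larger than the function's, and fall back to a
 conservative "full alignment plus padding" bound otherwise. A standalone
 worked example of the padding math, independent of the LLVM types:

 #include <cassert>
 #include <cstdint>
 // Bytes of padding needed to reach the next AlignBytes boundary; this is what
 // offsetToAlignment(Offset, Align(AlignBytes)) computes.
 static uint64_t padTo(uint64_t Offset, uint64_t AlignBytes) {
   return (AlignBytes - Offset % AlignBytes) % AlignBytes;
 }
 int main() {
   // Old code: alignment stored as log2, AlignAmt = 1 << 4 = 16 bytes.
   // New code: Align(16).value() = 16; the padding is the same either way.
   assert(padTo(20, 16) == 12); // block at offset 20 needs 12 bytes of nops
   assert(padTo(32, 16) == 0);  // already aligned: no padding
   // When the block's alignment exceeds the function's, the code above returns
   // Alignment.value() + padTo(...) as a conservative upper bound.
   return 0;
 }
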
/// We need to be careful about the offset of the first block in the function
@@ -179,7 +178,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn,
const MachineBasicBlock *Dest,
unsigned BrOffset) {
int BranchSize;
- unsigned MaxAlign = 2;
+ Align MaxAlign = Align(4);
bool NeedExtraAdjustment = false;
if (Dest->getNumber() <= Src->getNumber()) {
// If this is a backwards branch, the delta is the offset from the
@@ -192,8 +191,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn,
BranchSize += BlockSizes[DestBlock].first;
for (unsigned i = DestBlock+1, e = Src->getNumber(); i < e; ++i) {
BranchSize += BlockSizes[i].first;
- MaxAlign = std::max(MaxAlign,
- Fn.getBlockNumbered(i)->getAlignment());
+ MaxAlign = std::max(MaxAlign, Fn.getBlockNumbered(i)->getAlignment());
}
NeedExtraAdjustment = (FirstImpreciseBlock >= 0) &&
@@ -207,8 +205,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn,
MaxAlign = std::max(MaxAlign, Dest->getAlignment());
for (unsigned i = StartBlock+1, e = Dest->getNumber(); i != e; ++i) {
BranchSize += BlockSizes[i].first;
- MaxAlign = std::max(MaxAlign,
- Fn.getBlockNumbered(i)->getAlignment());
+ MaxAlign = std::max(MaxAlign, Fn.getBlockNumbered(i)->getAlignment());
}
NeedExtraAdjustment = (FirstImpreciseBlock >= 0) &&
@@ -258,7 +255,7 @@ int PPCBSel::computeBranchSize(MachineFunction &Fn,
// The computed offset is at most ((1 << alignment) - 4) bytes smaller
// than actual offset. So we add this number to the offset for safety.
if (NeedExtraAdjustment)
- BranchSize += (1 << MaxAlign) - 4;
+ BranchSize += MaxAlign.value() - 4;
return BranchSize;
}
@@ -339,16 +336,16 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
// 1. CR register
// 2. Target MBB
PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
- unsigned CRReg = I->getOperand(1).getReg();
+ Register CRReg = I->getOperand(1).getReg();
// Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
BuildMI(MBB, I, dl, TII->get(PPC::BCC))
.addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
} else if (I->getOpcode() == PPC::BC) {
- unsigned CRBit = I->getOperand(0).getReg();
+ Register CRBit = I->getOperand(0).getReg();
BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2);
} else if (I->getOpcode() == PPC::BCn) {
- unsigned CRBit = I->getOperand(0).getReg();
+ Register CRBit = I->getOperand(0).getReg();
BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2);
} else if (I->getOpcode() == PPC::BDNZ) {
BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 264d6b590f95..d8425d89da92 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -162,7 +162,7 @@ class PPCFastISel final : public FastISel {
bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
bool isZExt, unsigned DestReg,
const PPC::Predicate Pred);
- bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ bool PPCEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
const TargetRegisterClass *RC, bool IsZExt = true,
unsigned FP64LoadOpc = PPC::LFD);
bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
@@ -451,7 +451,7 @@ void PPCFastISel::PPCSimplifyAddress(Address &Addr, bool &UseOffset,
// Emit a load instruction if possible, returning true if we succeeded,
// otherwise false. See commentary below for how the register class of
// the load is determined.
-bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+bool PPCFastISel::PPCEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
const TargetRegisterClass *RC,
bool IsZExt, unsigned FP64LoadOpc) {
unsigned Opc;
@@ -469,7 +469,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
(ResultReg ? MRI.getRegClass(ResultReg) :
(RC ? RC :
(VT == MVT::f64 ? (HasSPE ? &PPC::SPERCRegClass : &PPC::F8RCRegClass) :
- (VT == MVT::f32 ? (HasSPE ? &PPC::SPE4RCRegClass : &PPC::F4RCRegClass) :
+ (VT == MVT::f32 ? (HasSPE ? &PPC::GPRCRegClass : &PPC::F4RCRegClass) :
(VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
&PPC::GPRC_and_GPRC_NOR0RegClass)))));
@@ -612,7 +612,7 @@ bool PPCFastISel::SelectLoad(const Instruction *I) {
const TargetRegisterClass *RC =
AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
- unsigned ResultReg = 0;
+ Register ResultReg = 0;
if (!PPCEmitLoad(VT, ResultReg, Addr, RC, true,
PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
return false;
@@ -989,7 +989,7 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
unsigned DestReg;
auto RC = MRI.getRegClass(SrcReg);
if (PPCSubTarget->hasSPE()) {
- DestReg = createResultReg(&PPC::SPE4RCRegClass);
+ DestReg = createResultReg(&PPC::GPRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::EFSCFD), DestReg)
.addReg(SrcReg);
@@ -1051,7 +1051,7 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
}
const TargetRegisterClass *RC = &PPC::F8RCRegClass;
- unsigned ResultReg = 0;
+ Register ResultReg = 0;
if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc))
return 0;
@@ -1176,7 +1176,7 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
const TargetRegisterClass *RC =
AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
- unsigned ResultReg = 0;
+ Register ResultReg = 0;
if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned))
return 0;
@@ -1229,9 +1229,9 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
if (PPCSubTarget->hasSPE()) {
DestReg = createResultReg(&PPC::GPRCRegClass);
if (IsSigned)
- Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ;
+ Opc = InRC == &PPC::GPRCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ;
else
- Opc = InRC == &PPC::SPE4RCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ;
+ Opc = InRC == &PPC::GPRCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ;
} else if (isVSFRCRegClass(RC)) {
DestReg = createResultReg(&PPC::VSFRCRegClass);
if (DstVT == MVT::i32)
@@ -1717,7 +1717,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(RV)) {
CCValAssign &VA = ValLocs[0];
- unsigned RetReg = VA.getLocReg();
+ Register RetReg = VA.getLocReg();
// We still need to worry about properly extending the sign. For example,
// we could have only a single bit or a constant that needs zero
// extension rather than sign extension. Make sure we pass the return
@@ -2002,7 +2002,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
const bool HasSPE = PPCSubTarget->hasSPE();
const TargetRegisterClass *RC;
if (HasSPE)
- RC = ((VT == MVT::f32) ? &PPC::SPE4RCRegClass : &PPC::SPERCRegClass);
+ RC = ((VT == MVT::f32) ? &PPC::GPRCRegClass : &PPC::SPERCRegClass);
else
RC = ((VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass);
@@ -2031,8 +2031,8 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addImm(0).addReg(TmpReg).addMemOperand(MMO);
} else {
- // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)).
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
+ // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA8(X2, Idx)).
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA8),
TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx);
// But for large code model, we must generate a LDtocL followed
// by the LF[SD].
@@ -2085,16 +2085,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
// or externally available linkage, a non-local function address, or a
// jump table address (not yet needed), or if we are generating code
// for large code model, we generate:
- // LDtocL(GV, ADDIStocHA(%x2, GV))
+ // LDtocL(GV, ADDIStocHA8(%x2, GV))
// Otherwise we generate:
- // ADDItocL(ADDIStocHA(%x2, GV), GV)
- // Either way, start with the ADDIStocHA:
+ // ADDItocL(ADDIStocHA8(%x2, GV), GV)
+ // Either way, start with the ADDIStocHA8:
unsigned HighPartReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA8),
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
- unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
- if (GVFlags & PPCII::MO_NLP_FLAG) {
+ if (PPCSubTarget->isGVIndirectSymbol(GV)) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
} else {
@@ -2353,7 +2352,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
if (!PPCComputeAddress(LI->getOperand(0), Addr))
return false;
- unsigned ResultReg = MI->getOperand(0).getReg();
+ Register ResultReg = MI->getOperand(0).getReg();
if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt,
PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
@@ -2464,7 +2463,7 @@ namespace llvm {
const TargetLibraryInfo *LibInfo) {
// Only available on 64-bit ELF for now.
const PPCSubtarget &Subtarget = FuncInfo.MF->getSubtarget<PPCSubtarget>();
- if (Subtarget.isPPC64() && Subtarget.isSVR4ABI())
+ if (Subtarget.is64BitELFABI())
return new PPCFastISel(FuncInfo, LibInfo);
return nullptr;
}
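
 The FastISel change above (and several ISelLowering changes below) replace
 conjunctions like isPPC64() && isSVR4ABI() with the newer subtarget
 predicates. Their definitions live in PPCSubtarget.h, outside this diff; the
 intended relationships, as a standalone sketch:

 // Sketch of the predicate relationships only; see PPCSubtarget.h for the
 // real bodies.
 struct ABIFlagsSketch {
   bool IsPPC64;
   bool IsSVR4; // ELF SVR4-style ABI
   bool IsAIX;  // XCOFF/AIX ABI
   bool is64BitELFABI() const { return IsSVR4 && IsPPC64; }
   bool is32BitELFABI() const { return IsSVR4 && !IsPPC64; }
   bool isAIXABI() const { return IsAIX; }
 };
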
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index ebfb1ef7f49b..06a4d183e781 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -47,13 +47,15 @@ static const MCPhysReg VRRegNo[] = {
};
static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) {
- if (STI.isDarwinABI())
+ if (STI.isDarwinABI() || STI.isAIXABI())
return STI.isPPC64() ? 16 : 8;
// SVR4 ABI:
return STI.isPPC64() ? 16 : 4;
}
static unsigned computeTOCSaveOffset(const PPCSubtarget &STI) {
+ if (STI.isAIXABI())
+ return STI.isPPC64() ? 40 : 20;
return STI.isELFv2ABI() ? 24 : 40;
}
@@ -88,6 +90,11 @@ static unsigned computeBasePointerSaveOffset(const PPCSubtarget &STI) {
: STI.getTargetMachine().isPositionIndependent() ? -12U : -8U;
}
+static unsigned computeCRSaveOffset() {
+ // The condition register save offset needs to be updated for AIX PPC32.
+ return 8;
+}
+
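
 The helpers above hard-code the link-area save offsets per ABI. A standalone
 restatement of the values this version produces (bytes from the stack pointer
 at entry; purely illustrative):

 static unsigned returnSaveOffset(bool IsPPC64, bool IsELF) {
   if (!IsELF)              // Darwin and AIX
     return IsPPC64 ? 16 : 8;
   return IsPPC64 ? 16 : 4; // SVR4 / ELF
 }
 static unsigned tocSaveOffset(bool IsPPC64, bool IsAIX, bool IsELFv2) {
   if (IsAIX)
     return IsPPC64 ? 40 : 20;
   return IsELFv2 ? 24 : 40;
 }
 static unsigned crSaveOffset() { return 8; } // still to be updated for AIX PPC32
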
PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
: TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
STI.getPlatformStackAlignment(), 0),
@@ -95,7 +102,8 @@ PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
TOCSaveOffset(computeTOCSaveOffset(Subtarget)),
FramePointerSaveOffset(computeFramePointerSaveOffset(Subtarget)),
LinkageSize(computeLinkageSize(Subtarget)),
- BasePointerSaveOffset(computeBasePointerSaveOffset(STI)) {}
+ BasePointerSaveOffset(computeBasePointerSaveOffset(Subtarget)),
+ CRSaveOffset(computeCRSaveOffset()) {}
// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
@@ -370,8 +378,8 @@ static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) {
return;
}
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if ((UsedRegMask & 0xFFFF) == UsedRegMask) {
if (DstReg != SrcReg)
@@ -781,15 +789,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
bool isPPC64 = Subtarget.isPPC64();
// Get the ABI.
bool isSVR4ABI = Subtarget.isSVR4ABI();
+ bool isAIXABI = Subtarget.isAIXABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
- assert((Subtarget.isDarwinABI() || isSVR4ABI) &&
- "Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
+ assert((Subtarget.isDarwinABI() || isSVR4ABI || isAIXABI) &&
+ "Unsupported PPC ABI.");
// Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it,
// process it.
if (!isSVR4ABI)
for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
+ if (isAIXABI)
+ report_fatal_error("UPDATE_VRSAVE is unexpected on AIX.");
HandleVRSaveUpdate(*MBBI, TII);
break;
}
@@ -819,7 +830,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
bool HasRedZone = isPPC64 || !isSVR4ABI;
unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1;
- unsigned BPReg = RegInfo->getBaseRegister(MF);
+ Register BPReg = RegInfo->getBaseRegister(MF);
unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR;
unsigned TOCReg = isPPC64 ? PPC::X2 : PPC::R2;
@@ -908,6 +919,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
assert((isPPC64 || !MustSaveCR) &&
"Prologue CR saving supported only in 64-bit mode");
+ if (MustSaveCR && isAIXABI)
+ report_fatal_error("Prologue CR saving is unimplemented on AIX.");
+
// Check if we can move the stack update instruction (stdu) down the prologue
// past the callee saves. Hopefully this will avoid the situation where the
// saves are waiting for the update on the store with update to complete.
@@ -966,7 +980,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
MIB.addReg(MustSaveCRs[i], CrState);
BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
.addReg(TempReg, getKillRegState(true))
- .addImm(8)
+ .addImm(getCRSaveOffset())
.addReg(SPReg);
}
@@ -1020,7 +1034,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
assert(HasRedZone && "A red zone is always available on PPC64");
BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
.addReg(TempReg, getKillRegState(true))
- .addImm(8)
+ .addImm(getCRSaveOffset())
.addReg(SPReg);
}
@@ -1324,7 +1338,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// actually saved gets its own CFI record.
unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2;
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
- nullptr, MRI->getDwarfRegNum(CRReg, true), 8));
+ nullptr, MRI->getDwarfRegNum(CRReg, true), getCRSaveOffset()));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
continue;
@@ -1387,7 +1401,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1;
- unsigned BPReg = RegInfo->getBaseRegister(MF);
+ Register BPReg = RegInfo->getBaseRegister(MF);
unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
unsigned ScratchReg = 0;
unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
@@ -1590,7 +1604,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
// is live here.
assert(HasRedZone && "Expecting red zone");
BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
- .addImm(8)
+ .addImm(getCRSaveOffset())
.addReg(SPReg);
for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
@@ -1614,7 +1628,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
assert(isPPC64 && "Expecting 64-bit mode");
assert(RBReg == SPReg && "Should be using SP as a base register");
BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
- .addImm(8)
+ .addImm(getCRSaveOffset())
.addReg(RBReg);
}
@@ -1762,8 +1776,8 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
// Save R31 if necessary
int FPSI = FI->getFramePointerSaveIndex();
- bool isPPC64 = Subtarget.isPPC64();
- bool isDarwinABI = Subtarget.isDarwinABI();
+ const bool isPPC64 = Subtarget.isPPC64();
+ const bool IsDarwinABI = Subtarget.isDarwinABI();
MachineFrameInfo &MFI = MF.getFrameInfo();
// If the frame pointer save index hasn't been defined yet.
@@ -1812,7 +1826,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
// For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the
// function uses CR 2, 3, or 4.
- if (!isPPC64 && !isDarwinABI &&
+ if (!isPPC64 && !IsDarwinABI &&
(SavedRegs.test(PPC::CR2) ||
SavedRegs.test(PPC::CR3) ||
SavedRegs.test(PPC::CR4))) {
@@ -1872,8 +1886,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
assert((!MF.getInfo<PPCFunctionInfo>()->mustSaveTOC() ||
(Reg != PPC::X2 && Reg != PPC::R2)) &&
"Not expecting to try to spill R2 in a function that must save TOC");
- if (PPC::GPRCRegClass.contains(Reg) ||
- PPC::SPE4RCRegClass.contains(Reg)) {
+ if (PPC::GPRCRegClass.contains(Reg)) {
HasGPSaveArea = true;
GPRegs.push_back(CSI[i]);
@@ -1967,7 +1980,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
assert(FI && "No Base Pointer Save Slot!");
MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
- unsigned BP = RegInfo->getBaseRegister(MF);
+ Register BP = RegInfo->getBaseRegister(MF);
if (PPC::G8RCRegClass.contains(BP)) {
MinG8R = std::min<unsigned>(MinG8R, BP);
HasG8SaveArea = true;
@@ -2428,6 +2441,26 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
+unsigned PPCFrameLowering::getTOCSaveOffset() const {
+ if (Subtarget.isAIXABI())
+ // TOC save/restore is normally handled by the linker.
+ // Indirect calls should hit this limitation.
+ report_fatal_error("TOC save is not implemented on AIX yet.");
+ return TOCSaveOffset;
+}
+
+unsigned PPCFrameLowering::getFramePointerSaveOffset() const {
+ if (Subtarget.isAIXABI())
+ report_fatal_error("FramePointer is not implemented on AIX yet.");
+ return FramePointerSaveOffset;
+}
+
+unsigned PPCFrameLowering::getBasePointerSaveOffset() const {
+ if (Subtarget.isAIXABI())
+ report_fatal_error("BasePointer is not implemented on AIX yet.");
+ return BasePointerSaveOffset;
+}
+
bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
if (MF.getInfo<PPCFunctionInfo>()->shrinkWrapDisabled())
return false;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index d116e9fd22e1..a5fbc9acbb28 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -26,6 +26,7 @@ class PPCFrameLowering: public TargetFrameLowering {
const unsigned FramePointerSaveOffset;
const unsigned LinkageSize;
const unsigned BasePointerSaveOffset;
+ const unsigned CRSaveOffset;
/**
* Find register[s] that can be used in function prologue and epilogue
@@ -142,15 +143,19 @@ public:
/// getTOCSaveOffset - Return the previous frame offset to save the
/// TOC register -- 64-bit SVR4 ABI only.
- unsigned getTOCSaveOffset() const { return TOCSaveOffset; }
+ unsigned getTOCSaveOffset() const;
/// getFramePointerSaveOffset - Return the previous frame offset to save the
/// frame pointer.
- unsigned getFramePointerSaveOffset() const { return FramePointerSaveOffset; }
+ unsigned getFramePointerSaveOffset() const;
/// getBasePointerSaveOffset - Return the previous frame offset to save the
/// base pointer.
- unsigned getBasePointerSaveOffset() const { return BasePointerSaveOffset; }
+ unsigned getBasePointerSaveOffset() const;
+
+ /// getCRSaveOffset - Return the previous frame offset to save the
+ /// CR register.
+ unsigned getCRSaveOffset() const { return CRSaveOffset; }
/// getLinkageSize - Return the size of the PowerPC ABI linkage area.
///
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 543cac075f55..4ad6c88233fe 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -371,7 +371,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
// by the scheduler. Detect them now.
bool HasVectorVReg = false;
for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) {
HasVectorVReg = true;
break;
@@ -391,8 +391,8 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
// Create two vregs - one to hold the VRSAVE register that is live-in to the
// function and one for the value after having bits or'd into it.
- unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
- unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+ Register InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+ Register UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
MachineBasicBlock &EntryBB = *Fn.begin();
@@ -447,7 +447,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
} else {
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
- unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+ Register TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
BuildMI(FirstMBB, MBBI, dl,
TII.get(PPC::UpdateGBR), GlobalBaseReg)
.addReg(TempReg, RegState::Define).addReg(GlobalBaseReg);
@@ -5065,52 +5065,95 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
case PPCISD::TOC_ENTRY: {
- assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) &&
- "Only supported for 64-bit ABI and 32-bit SVR4");
- if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) {
- SDValue GA = N->getOperand(0);
- SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
- N->getOperand(1));
- transferMemOperands(N, MN);
- ReplaceNode(N, MN);
- return;
- }
+ const bool isPPC64 = PPCSubTarget->isPPC64();
+ const bool isELFABI = PPCSubTarget->isSVR4ABI();
+ const bool isAIXABI = PPCSubTarget->isAIXABI();
+
+ assert(!PPCSubTarget->isDarwin() && "TOC is an ELF/XCOFF construct");
+
+ // PowerPC only support small, medium and large code model.
+      // PowerPC only supports small, medium and large code models.
+ assert(!(CModel == CodeModel::Tiny || CModel == CodeModel::Kernel) &&
+ "PowerPC doesn't support tiny or kernel code models.");
- // For medium and large code model, we generate two instructions as
- // described below. Otherwise we allow SelectCodeCommon to handle this,
+ if (isAIXABI && CModel == CodeModel::Medium)
+ report_fatal_error("Medium code model is not supported on AIX.");
+
+ // For 64-bit small code model, we allow SelectCodeCommon to handle this,
// selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA.
- CodeModel::Model CModel = TM.getCodeModel();
- if (CModel != CodeModel::Medium && CModel != CodeModel::Large)
+ if (isPPC64 && CModel == CodeModel::Small)
break;
- // The first source operand is a TargetGlobalAddress or a TargetJumpTable.
- // If it must be toc-referenced according to PPCSubTarget, we generate:
- // LDtocL(@sym, ADDIStocHA(%x2, @sym))
+ // Handle 32-bit small code model.
+ if (!isPPC64) {
+ // Transforms the ISD::TOC_ENTRY node to a PPCISD::LWZtoc.
+ auto replaceWithLWZtoc = [this, &dl](SDNode *TocEntry) {
+ SDValue GA = TocEntry->getOperand(0);
+ SDValue TocBase = TocEntry->getOperand(1);
+ SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
+ TocBase);
+ transferMemOperands(TocEntry, MN);
+ ReplaceNode(TocEntry, MN);
+ };
+
+ if (isELFABI) {
+ assert(TM.isPositionIndependent() &&
+ "32-bit ELF can only have TOC entries in position independent"
+ " code.");
+ // 32-bit ELF always uses a small code model toc access.
+ replaceWithLWZtoc(N);
+ return;
+ }
+
+ if (isAIXABI && CModel == CodeModel::Small) {
+ replaceWithLWZtoc(N);
+ return;
+ }
+ }
+
+ assert(CModel != CodeModel::Small && "All small code models handled.");
+
+ assert((isPPC64 || (isAIXABI && !isPPC64)) && "We are dealing with 64-bit"
+ " ELF/AIX or 32-bit AIX in the following.");
+
+    // Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model
+ // or 64-bit medium (ELF-only) or large (ELF and AIX) code model code. We
+ // generate two instructions as described below. The first source operand
+ // is a symbol reference. If it must be toc-referenced according to
+ // PPCSubTarget, we generate:
+ // [32-bit AIX]
+ // LWZtocL(@sym, ADDIStocHA(%r2, @sym))
+ // [64-bit ELF/AIX]
+ // LDtocL(@sym, ADDIStocHA8(%x2, @sym))
// Otherwise we generate:
- // ADDItocL(ADDIStocHA(%x2, @sym), @sym)
+ // ADDItocL(ADDIStocHA8(%x2, @sym), @sym)
SDValue GA = N->getOperand(0);
SDValue TOCbase = N->getOperand(1);
- SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
- TOCbase, GA);
+
+ EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+ SDNode *Tmp = CurDAG->getMachineNode(
+ isPPC64 ? PPC::ADDIStocHA8 : PPC::ADDIStocHA, dl, VT, TOCbase, GA);
+
if (PPCLowering->isAccessedAsGotIndirect(GA)) {
- // If it is access as got-indirect, we need an extra LD to load
+ // If it is accessed as got-indirect, we need an extra LWZ/LD to load
// the address.
- SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
- SDValue(Tmp, 0));
+ SDNode *MN = CurDAG->getMachineNode(
+ isPPC64 ? PPC::LDtocL : PPC::LWZtocL, dl, VT, GA, SDValue(Tmp, 0));
+
transferMemOperands(N, MN);
ReplaceNode(N, MN);
return;
}
- // Build the address relative to the TOC-pointer..
+ // Build the address relative to the TOC-pointer.
ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
SDValue(Tmp, 0), GA));
return;
}
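
   For readers following the new control flow, the net opcode selection can be
   restated compactly (this mirrors the comments above; it is a summary, not
   additional code in the patch):

   //   32-bit ELF (always PIC here) ......... LWZtoc
   //   32-bit AIX, small code model ......... LWZtoc
   //   64-bit, small code model ............. LDtoc/LDtocJTI/LDtocCPT/LDtocBA
   //                                          (left to SelectCodeCommon)
   //   32-bit AIX, large code model ......... LWZtocL(@sym, ADDIStocHA(%r2, @sym))
   //   64-bit medium/large, toc-indirect .... LDtocL(@sym, ADDIStocHA8(%x2, @sym))
   //   64-bit medium/large, direct .......... ADDItocL(ADDIStocHA8(%x2, @sym), @sym)
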
case PPCISD::PPC32_PICGOT:
// Generate a PIC-safe GOT reference.
- assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
- "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
+ assert(PPCSubTarget->is32BitELFABI() &&
+ "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT,
PPCLowering->getPointerTy(CurDAG->getDataLayout()),
MVT::i32);
@@ -6456,7 +6499,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
continue;
if (!HBase.isMachineOpcode() ||
- HBase.getMachineOpcode() != PPC::ADDIStocHA)
+ HBase.getMachineOpcode() != PPC::ADDIStocHA8)
continue;
if (!Base.hasOneUse() || !HBase.hasOneUse())
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 24d50074860d..8cf6a660b08b 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -139,13 +139,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
// arguments are at least 4/8 bytes aligned.
bool isPPC64 = Subtarget.isPPC64();
- setMinStackArgumentAlignment(isPPC64 ? 8:4);
+ setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));
// Set up the register classes.
addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
if (!useSoftFloat()) {
if (hasSPE()) {
- addRegisterClass(MVT::f32, &PPC::SPE4RCRegClass);
+ addRegisterClass(MVT::f32, &PPC::GPRCRegClass);
addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
} else {
addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
@@ -431,28 +431,26 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
- if (Subtarget.isSVR4ABI()) {
- if (isPPC64) {
- // VAARG always uses double-word chunks, so promote anything smaller.
- setOperationAction(ISD::VAARG, MVT::i1, Promote);
- AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
- setOperationAction(ISD::VAARG, MVT::i8, Promote);
- AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64);
- setOperationAction(ISD::VAARG, MVT::i16, Promote);
- AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64);
- setOperationAction(ISD::VAARG, MVT::i32, Promote);
- AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64);
- setOperationAction(ISD::VAARG, MVT::Other, Expand);
- } else {
- // VAARG is custom lowered with the 32-bit SVR4 ABI.
- setOperationAction(ISD::VAARG, MVT::Other, Custom);
- setOperationAction(ISD::VAARG, MVT::i64, Custom);
- }
+ if (Subtarget.is64BitELFABI()) {
+ // VAARG always uses double-word chunks, so promote anything smaller.
+ setOperationAction(ISD::VAARG, MVT::i1, Promote);
+ AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
+ setOperationAction(ISD::VAARG, MVT::i8, Promote);
+ AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
+ setOperationAction(ISD::VAARG, MVT::i16, Promote);
+ AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
+ setOperationAction(ISD::VAARG, MVT::i32, Promote);
+ AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ } else if (Subtarget.is32BitELFABI()) {
+ // VAARG is custom lowered with the 32-bit SVR4 ABI.
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ setOperationAction(ISD::VAARG, MVT::i64, Custom);
} else
setOperationAction(ISD::VAARG, MVT::Other, Expand);
- if (Subtarget.isSVR4ABI() && !isPPC64)
- // VACOPY is custom lowered with the 32-bit SVR4 ABI.
+ // VACOPY is custom lowered with the 32-bit SVR4 ABI.
+ if (Subtarget.is32BitELFABI())
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
else
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
@@ -553,17 +551,25 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (Subtarget.hasAltivec()) {
// First set operation action for all vector types to expand. Then we
// will selectively turn on ones that can be effectively codegen'd.
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
// For v2i64, these are only valid with P8Vector. This is corrected after
// the loop.
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
+ if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ }
+ else {
+ setOperationAction(ISD::SMAX, VT, Expand);
+ setOperationAction(ISD::SMIN, VT, Expand);
+ setOperationAction(ISD::UMAX, VT, Expand);
+ setOperationAction(ISD::UMIN, VT, Expand);
+ }
if (Subtarget.hasVSX()) {
setOperationAction(ISD::FMAXNUM, VT, Legal);
@@ -646,7 +652,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
- for (MVT InnerVT : MVT::vector_valuetypes()) {
+ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
@@ -944,7 +950,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);
setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
- setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
@@ -1118,6 +1123,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+
if (Subtarget.useCRBits()) {
setTargetDAGCombine(ISD::TRUNCATE);
@@ -1172,9 +1179,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setJumpIsExpensive();
}
- setMinFunctionAlignment(2);
+ setMinFunctionAlignment(Align(4));
if (Subtarget.isDarwin())
- setPrefFunctionAlignment(4);
+ setPrefFunctionAlignment(Align(16));
switch (Subtarget.getDarwinDirective()) {
default: break;
@@ -1191,8 +1198,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
- setPrefFunctionAlignment(4);
- setPrefLoopAlignment(4);
+ setPrefLoopAlignment(Align(16));
+ setPrefFunctionAlignment(Align(16));
break;
}
@@ -1352,6 +1359,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
+ case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";
+ case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE";
case PPCISD::ST_VSR_SCAL_INT:
return "PPCISD::ST_VSR_SCAL_INT";
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
@@ -1396,7 +1405,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
- case PPCISD::FP_EXTEND_LH: return "PPCISD::FP_EXTEND_LH";
+ case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
+ case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
}
return nullptr;
}
@@ -1517,7 +1527,7 @@ bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
SelectionDAG &DAG) {
const PPCSubtarget& Subtarget =
- static_cast<const PPCSubtarget&>(DAG.getSubtarget());
+ static_cast<const PPCSubtarget&>(DAG.getSubtarget());
if (!Subtarget.hasP8Vector())
return false;
@@ -1769,10 +1779,10 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a splat of a single element that is suitable for input to
-/// VSPLTB/VSPLTH/VSPLTW.
+/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
- assert(N->getValueType(0) == MVT::v16i8 &&
- (EltSize == 1 || EltSize == 2 || EltSize == 4));
+ assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) &&
+ EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
// The consecutive indices need to specify an element, not part of two
// different elements. So abandon ship early if this isn't the case.
@@ -2065,10 +2075,11 @@ bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
}
-/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
-/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
-unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
- SelectionDAG &DAG) {
+/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
+/// appropriate for PPC mnemonics (which have a big endian bias - namely
+/// elements are counted from the left of the vector register).
+unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
assert(isSplatShuffleMask(SVOp, EltSize));
if (DAG.getDataLayout().isLittleEndian())
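// A minimal illustrative sketch (not part of the patch; function and parameter
// names are illustrative), assuming the v16i8 byte-mask convention used by
// isSplatShuffleMask above: how a byte index from the shuffle mask maps to the
// left-to-right element numbering that PPC splat mnemonics expect.
unsigned splatIdxForMnemonic(unsigned MaskByte, unsigned EltSize,
                             bool IsLittleEndian) {
  unsigned NumElts = 16 / EltSize;   // elements in a 128-bit vector register
  unsigned Idx = MaskByte / EltSize; // element index in mask order
  return IsLittleEndian ? NumElts - 1 - Idx : Idx;
}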
@@ -2667,12 +2678,14 @@ static void setUsesTOCBasePtr(SelectionDAG &DAG) {
setUsesTOCBasePtr(DAG.getMachineFunction());
}
-static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit,
- SDValue GA) {
+SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue GA) const {
+ const bool Is64Bit = Subtarget.isPPC64();
EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
- SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) :
- DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
-
+ SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT)
+ : Subtarget.isAIXABI()
+ ? DAG.getRegister(PPC::R2, VT)
+ : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
SDValue Ops[] = { GA, Reg };
return DAG.getMemIntrinsicNode(
PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
@@ -2688,10 +2701,10 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
// 64-bit SVR4 ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
- if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+ if (Subtarget.is64BitELFABI()) {
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
- return getTOCEntry(DAG, SDLoc(CP), true, GA);
+ return getTOCEntry(DAG, SDLoc(CP), GA);
}
unsigned MOHiFlag, MOLoFlag;
@@ -2701,7 +2714,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
PPCII::MO_PIC_FLAG);
- return getTOCEntry(DAG, SDLoc(CP), false, GA);
+ return getTOCEntry(DAG, SDLoc(CP), GA);
}
SDValue CPIHi =
@@ -2764,10 +2777,10 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
// 64-bit SVR4 ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
- if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+ if (Subtarget.is64BitELFABI()) {
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
- return getTOCEntry(DAG, SDLoc(JT), true, GA);
+ return getTOCEntry(DAG, SDLoc(JT), GA);
}
unsigned MOHiFlag, MOLoFlag;
@@ -2777,7 +2790,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
PPCII::MO_PIC_FLAG);
- return getTOCEntry(DAG, SDLoc(GA), false, GA);
+ return getTOCEntry(DAG, SDLoc(GA), GA);
}
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
@@ -2793,14 +2806,18 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
// 64-bit SVR4 ABI code is always position-independent.
// The actual BlockAddress is stored in the TOC.
- if (Subtarget.isSVR4ABI() &&
- (Subtarget.isPPC64() || isPositionIndependent())) {
- if (Subtarget.isPPC64())
- setUsesTOCBasePtr(DAG);
+ if (Subtarget.is64BitELFABI()) {
+ setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
- return getTOCEntry(DAG, SDLoc(BASDN), Subtarget.isPPC64(), GA);
+ return getTOCEntry(DAG, SDLoc(BASDN), GA);
}
+ // 32-bit position-independent ELF stores the BlockAddress in the .got.
+ if (Subtarget.is32BitELFABI() && isPositionIndependent())
+ return getTOCEntry(
+ DAG, SDLoc(BASDN),
+ DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()));
+
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
@@ -2913,12 +2930,12 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SDLoc DL(GSDN);
const GlobalValue *GV = GSDN->getGlobal();
- // 64-bit SVR4 ABI code is always position-independent.
+ // 64-bit SVR4 ABI & AIX ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
- if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+ if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
- return getTOCEntry(DAG, DL, true, GA);
+ return getTOCEntry(DAG, DL, GA);
}
unsigned MOHiFlag, MOLoFlag;
@@ -2929,7 +2946,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
GSDN->getOffset(),
PPCII::MO_PIC_FLAG);
- return getTOCEntry(DAG, DL, false, GA);
+ return getTOCEntry(DAG, DL, GA);
}
SDValue GAHi =
@@ -3235,8 +3252,8 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV, nextOffset));
}
-/// FPR - The set of FP registers that should be allocated for arguments,
-/// on Darwin.
+/// FPR - The set of FP registers that should be allocated for arguments
+/// on Darwin and AIX.
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
PPC::F11, PPC::F12, PPC::F13};
@@ -3377,17 +3394,17 @@ SDValue PPCTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- if (Subtarget.isSVR4ABI()) {
- if (Subtarget.isPPC64())
- return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
- dl, DAG, InVals);
- else
- return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
- dl, DAG, InVals);
- } else {
- return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
- dl, DAG, InVals);
- }
+ if (Subtarget.is64BitELFABI())
+ return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
+ InVals);
+ else if (Subtarget.is32BitELFABI())
+ return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
+ InVals);
+
+  // FIXME: We are using this for both AIX and Darwin. We should add
+  // appropriate AIX tests and rename this function accordingly.
+ return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG,
+ InVals);
}
SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
@@ -3467,7 +3484,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
if (Subtarget.hasP8Vector())
RC = &PPC::VSSRCRegClass;
else if (Subtarget.hasSPE())
- RC = &PPC::SPE4RCRegClass;
+ RC = &PPC::GPRCRegClass;
else
RC = &PPC::F4RCRegClass;
break;
@@ -4516,7 +4533,7 @@ callsShareTOCBase(const Function *Caller, SDValue Callee,
static bool
needStackSlotPassParameters(const PPCSubtarget &Subtarget,
const SmallVectorImpl<ISD::OutputArg> &Outs) {
- assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());
+ assert(Subtarget.is64BitELFABI());
const unsigned PtrByteSize = 8;
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
@@ -4926,7 +4943,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
ImmutableCallSite CS, const PPCSubtarget &Subtarget) {
bool isPPC64 = Subtarget.isPPC64();
bool isSVR4ABI = Subtarget.isSVR4ABI();
- bool isELFv2ABI = Subtarget.isELFv2ABI();
+ bool is64BitELFv1ABI = isPPC64 && isSVR4ABI && !Subtarget.isELFv2ABI();
bool isAIXABI = Subtarget.isAIXABI();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
@@ -4997,7 +5014,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
// to do the call, we can't use PPCISD::CALL.
SDValue MTCTROps[] = {Chain, Callee, InFlag};
- if (isSVR4ABI && isPPC64 && !isELFv2ABI) {
+ if (is64BitELFv1ABI) {
// Function pointers in the 64-bit SVR4 ABI do not point to the function
// entry point, but to the function descriptor (the function entry point
// address is part of the function descriptor though).
@@ -5085,7 +5102,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
CallOpc = PPCISD::BCTRL;
Callee.setNode(nullptr);
// Add use of X11 (holding environment pointer)
- if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest)
+ if (is64BitELFv1ABI && !hasNest)
Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
// Add CTR register as callee so a bctr can be emitted later.
if (isTailCall)
@@ -6730,8 +6747,12 @@ SDValue PPCTargetLowering::LowerCall_AIX(
const unsigned NumGPRs = isPPC64 ? array_lengthof(GPR_64)
: array_lengthof(GPR_32);
+ const unsigned NumFPRs = array_lengthof(FPR);
+ assert(NumFPRs == 13 && "Only FPR 1-13 could be used for parameter passing "
+ "on AIX");
+
const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
- unsigned GPR_idx = 0;
+ unsigned GPR_idx = 0, FPR_idx = 0;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
@@ -6768,6 +6789,20 @@ SDValue PPCTargetLowering::LowerCall_AIX(
break;
case MVT::f32:
case MVT::f64:
+ if (FPR_idx != NumFPRs) {
+ RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+      // Arguments passed in FPRs also shadow GPRs: each FP argument consumes
+      // one available GPR, or two for an f64 in 32-bit mode.
+ if (GPR_idx != NumGPRs)
+ ++GPR_idx;
+ if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64)
+ ++GPR_idx;
+ } else
+ report_fatal_error("Handling of placing parameters on the stack is "
+ "unimplemented!");
+ break;
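// A minimal illustrative sketch (not part of the patch; names are
// illustrative): how many argument GPRs a floating-point argument shadows
// under the convention modeled in the f32/f64 case above.
unsigned gprsShadowedByFPArg(bool IsF64, bool IsPPC64) {
  return (IsF64 && !IsPPC64) ? 2u : 1u; // an f64 spans two 32-bit GPR slots
}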
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
@@ -8152,6 +8187,18 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
Op0.getOperand(1));
}
+static const SDValue *getNormalLoadInput(const SDValue &Op) {
+ const SDValue *InputLoad = &Op;
+ if (InputLoad->getOpcode() == ISD::BITCAST)
+ InputLoad = &InputLoad->getOperand(0);
+ if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR)
+ InputLoad = &InputLoad->getOperand(0);
+ if (InputLoad->getOpcode() != ISD::LOAD)
+ return nullptr;
+ LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+ return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -8274,6 +8321,34 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
SplatBitSize > 32) {
+
+ const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
+ // Handle load-and-splat patterns as we have instructions that will do this
+ // in one go.
+ if (InputLoad && DAG.isSplatValue(Op, true)) {
+ LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+
+ // We have handling for 4 and 8 byte elements.
+ unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
+
+      // To check that this load has a single use, we have to check for
+      // vector width (128 bits) / ElementSize uses (since each operand of
+      // the BUILD_VECTOR is a separate use of the loaded value).
+ if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) &&
+ ((Subtarget.hasVSX() && ElementSize == 64) ||
+ (Subtarget.hasP9Vector() && ElementSize == 32))) {
+ SDValue Ops[] = {
+ LD->getChain(), // Chain
+ LD->getBasePtr(), // Ptr
+ DAG.getValueType(Op.getValueType()) // VT
+ };
+ return
+ DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl,
+ DAG.getVTList(Op.getValueType(), MVT::Other),
+ Ops, LD->getMemoryVT(), LD->getMemOperand());
+ }
+ }
+
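// A minimal illustrative sketch (not part of the patch; the helper name is
// illustrative): the hasNUsesOfValue check above expects the loaded scalar to
// appear once per BUILD_VECTOR lane, i.e. vector width / element width uses.
unsigned expectedSplatUses(unsigned ElementSizeInBits) {
  return 128 / ElementSizeInBits; // 2 uses for 64-bit lanes, 4 for 32-bit
}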
// BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
// lowered to VSX instructions under certain conditions.
// Without VSX, there is no pattern more efficient than expanding the node.
@@ -8759,6 +8834,45 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
unsigned ShiftElts, InsertAtByte;
bool Swap = false;
+
+  // If this is a load-and-splat, we can do that with a single instruction
+  // in some cases. However, if the load has multiple uses, we don't want to
+  // combine it because that will just produce multiple loads.
+ const SDValue *InputLoad = getNormalLoadInput(V1);
+ if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
+ (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
+ InputLoad->hasOneUse()) {
+ bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
+ int SplatIdx =
+ PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
+
+ LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+ // For 4-byte load-and-splat, we need Power9.
+ if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
+ uint64_t Offset = 0;
+ if (IsFourByte)
+ Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
+ else
+ Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
+ SDValue BasePtr = LD->getBasePtr();
+ if (Offset != 0)
+ BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ BasePtr, DAG.getIntPtrConstant(Offset, dl));
+ SDValue Ops[] = {
+ LD->getChain(), // Chain
+ BasePtr, // BasePtr
+ DAG.getValueType(Op.getValueType()) // VT
+ };
+ SDVTList VTL =
+ DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
+ SDValue LdSplt =
+ DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
+ Ops, LD->getMemoryVT(), LD->getMemOperand());
+ if (LdSplt.getValueType() != SVOp->getValueType(0))
+ LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
+ return LdSplt;
+ }
+ }
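// A minimal illustrative sketch (not part of the patch; names are
// illustrative): the byte offset of the splatted element from the load's base
// pointer, as computed above. Little endian mirrors the mnemonic's element
// numbering, hence the (NumElts - 1 - SplatIdx) form of the index.
unsigned splatByteOffset(int SplatIdx, bool IsFourByte, bool IsLittleEndian) {
  if (IsFourByte)
    return (IsLittleEndian ? 3 - SplatIdx : SplatIdx) * 4;
  return (IsLittleEndian ? 1 - SplatIdx : SplatIdx) * 8;
}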
if (Subtarget.hasP9Vector() &&
PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
isLittleEndian)) {
@@ -8835,7 +8949,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
if (Subtarget.hasVSX()) {
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
- int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+ int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
@@ -9880,6 +9994,30 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
switch (Op0.getOpcode()) {
default:
return SDValue();
+ case ISD::EXTRACT_SUBVECTOR: {
+ assert(Op0.getNumOperands() == 2 &&
+ isa<ConstantSDNode>(Op0->getOperand(1)) &&
+ "Node should have 2 operands with second one being a constant!");
+
+ if (Op0.getOperand(0).getValueType() != MVT::v4f32)
+ return SDValue();
+
+ // Custom lower is only done for high or low doubleword.
+ int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ if (Idx % 2 != 0)
+ return SDValue();
+
+ // Since input is v4f32, at this point Idx is either 0 or 2.
+ // Shift to get the doubleword position we want.
+ int DWord = Idx >> 1;
+
+    // The high and low doubleword positions are swapped on little endian.
+ if (Subtarget.isLittleEndian())
+ DWord ^= 0x1;
+
+ return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64,
+ Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32));
+ }
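// A minimal illustrative sketch (not part of the patch; the function name is
// illustrative): which doubleword of the v4f32 source FP_EXTEND_HALF widens,
// given the EXTRACT_SUBVECTOR index (0 or 2) and the endianness, mirroring
// the DWord computation above.
int fpExtendHalfDWord(int SubvectorIdx, bool IsLittleEndian) {
  int DWord = SubvectorIdx >> 1; // index 0 or 2 -> doubleword 0 or 1
  return IsLittleEndian ? DWord ^ 1 : DWord;
}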
case ISD::FADD:
case ISD::FMUL:
case ISD::FSUB: {
@@ -9891,26 +10029,25 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
// Generate new load node.
LoadSDNode *LD = cast<LoadSDNode>(LdOp);
- SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
- NewLoad[i] =
- DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
- DAG.getVTList(MVT::v4f32, MVT::Other),
- LoadOps, LD->getMemoryVT(),
- LD->getMemOperand());
- }
- SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32,
- NewLoad[0], NewLoad[1],
- Op0.getNode()->getFlags());
- return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewOp);
+ SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
+ NewLoad[i] = DAG.getMemIntrinsicNode(
+ PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
+ LD->getMemoryVT(), LD->getMemOperand());
+ }
+ SDValue NewOp =
+ DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0],
+ NewLoad[1], Op0.getNode()->getFlags());
+ return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp,
+ DAG.getConstant(0, dl, MVT::i32));
}
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(Op0);
- SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
- SDValue NewLd =
- DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
- DAG.getVTList(MVT::v4f32, MVT::Other),
- LoadOps, LD->getMemoryVT(), LD->getMemOperand());
- return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewLd);
+ SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
+ SDValue NewLd = DAG.getMemIntrinsicNode(
+ PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
+ LD->getMemoryVT(), LD->getMemOperand());
+ return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd,
+ DAG.getConstant(0, dl, MVT::i32));
}
}
  llvm_unreachable("ERROR: Should return for all cases within switch.");
@@ -10048,9 +10185,11 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
return;
case ISD::TRUNCATE: {
EVT TrgVT = N->getValueType(0);
+ EVT OpVT = N->getOperand(0).getValueType();
if (TrgVT.isVector() &&
isOperationCustom(N->getOpcode(), TrgVT) &&
- N->getOperand(0).getValueType().getSizeInBits() <= 128)
+ OpVT.getSizeInBits() <= 128 &&
+ isPowerOf2_32(OpVT.getVectorElementType().getSizeInBits()))
Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
return;
}
@@ -10192,7 +10331,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
if (CmpOpcode) {
// Signed comparisons of byte or halfword values must be sign-extended.
if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
- unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+ Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
ExtReg).addReg(dest);
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
@@ -10243,10 +10382,10 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
- unsigned dest = MI.getOperand(0).getReg();
- unsigned ptrA = MI.getOperand(1).getReg();
- unsigned ptrB = MI.getOperand(2).getReg();
- unsigned incr = MI.getOperand(3).getReg();
+ Register dest = MI.getOperand(0).getReg();
+ Register ptrA = MI.getOperand(1).getReg();
+ Register ptrB = MI.getOperand(2).getReg();
+ Register incr = MI.getOperand(3).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -10364,7 +10503,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
if (CmpOpcode) {
// For unsigned comparisons, we can directly compare the shifted values.
// For signed comparisons we shift and sign extend.
- unsigned SReg = RegInfo.createVirtualRegister(GPRC);
+ Register SReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(PPC::AND), SReg)
.addReg(TmpDestReg)
.addReg(MaskReg);
@@ -10375,7 +10514,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)
.addReg(SReg)
.addReg(ShiftReg);
- unsigned ValueSReg = RegInfo.createVirtualRegister(GPRC);
+ Register ValueSReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)
.addReg(ValueReg);
ValueReg = ValueSReg;
@@ -10426,11 +10565,11 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
- unsigned mainDstReg = MRI.createVirtualRegister(RC);
- unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+ Register mainDstReg = MRI.createVirtualRegister(RC);
+ Register restoreDstReg = MRI.createVirtualRegister(RC);
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
@@ -10482,10 +10621,10 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
// Prepare IP either in reg.
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
- unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
- unsigned BufReg = MI.getOperand(1).getReg();
+ Register LabelReg = MRI.createVirtualRegister(PtrRC);
+ Register BufReg = MI.getOperand(1).getReg();
- if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
+ if (Subtarget.is64BitELFABI()) {
setUsesTOCBasePtr(*MBB->getParent());
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
.addReg(PPC::X2)
@@ -10570,7 +10709,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
- unsigned Tmp = MRI.createVirtualRegister(RC);
+ Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
@@ -10587,7 +10726,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
const int64_t TOCOffset = 3 * PVT.getStoreSize();
const int64_t BPOffset = 4 * PVT.getStoreSize();
- unsigned BufReg = MI.getOperand(0).getReg();
+ Register BufReg = MI.getOperand(0).getReg();
// Reload FP (the jumped-to function may not have had a
// frame pointer, and if so, then its r31 will be restored
@@ -10662,7 +10801,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
if (MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
- if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() &&
+ if (Subtarget.is64BitELFABI() &&
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
// Call lowering should have added an r2 operand to indicate a dependence
// on the TOC base pointer value. It can't however, because there is no
@@ -10828,15 +10967,15 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BB = readMBB;
MachineRegisterInfo &RegInfo = F->getRegInfo();
- unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
- unsigned LoReg = MI.getOperand(0).getReg();
- unsigned HiReg = MI.getOperand(1).getReg();
+ Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+ Register LoReg = MI.getOperand(0).getReg();
+ Register HiReg = MI.getOperand(1).getReg();
BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
- unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+ Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
.addReg(HiReg)
@@ -10978,11 +11117,11 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
StoreMnemonic = PPC::STDCX;
break;
}
- unsigned dest = MI.getOperand(0).getReg();
- unsigned ptrA = MI.getOperand(1).getReg();
- unsigned ptrB = MI.getOperand(2).getReg();
- unsigned oldval = MI.getOperand(3).getReg();
- unsigned newval = MI.getOperand(4).getReg();
+ Register dest = MI.getOperand(0).getReg();
+ Register ptrA = MI.getOperand(1).getReg();
+ Register ptrB = MI.getOperand(2).getReg();
+ Register oldval = MI.getOperand(3).getReg();
+ Register newval = MI.getOperand(4).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -11057,11 +11196,11 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
bool isLittleEndian = Subtarget.isLittleEndian();
bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
- unsigned dest = MI.getOperand(0).getReg();
- unsigned ptrA = MI.getOperand(1).getReg();
- unsigned ptrB = MI.getOperand(2).getReg();
- unsigned oldval = MI.getOperand(3).getReg();
- unsigned newval = MI.getOperand(4).getReg();
+ Register dest = MI.getOperand(0).getReg();
+ Register ptrA = MI.getOperand(1).getReg();
+ Register ptrB = MI.getOperand(2).getReg();
+ Register oldval = MI.getOperand(3).getReg();
+ Register newval = MI.getOperand(4).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
@@ -11238,13 +11377,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// This pseudo performs an FADD with rounding mode temporarily forced
// to round-to-zero. We emit this via custom inserter since the FPSCR
// is not modeled at the SelectionDAG level.
- unsigned Dest = MI.getOperand(0).getReg();
- unsigned Src1 = MI.getOperand(1).getReg();
- unsigned Src2 = MI.getOperand(2).getReg();
+ Register Dest = MI.getOperand(0).getReg();
+ Register Src1 = MI.getOperand(1).getReg();
+ Register Src2 = MI.getOperand(2).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
- unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+ Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
@@ -11270,7 +11409,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8);
MachineRegisterInfo &RegInfo = F->getRegInfo();
- unsigned Dest = RegInfo.createVirtualRegister(
+ Register Dest = RegInfo.createVirtualRegister(
Opcode == PPC::ANDIo ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);
DebugLoc dl = MI.getDebugLoc();
@@ -11283,7 +11422,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
} else if (MI.getOpcode() == PPC::TCHECK_RET) {
DebugLoc Dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
- unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+ Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
MI.getOperand(0).getReg())
@@ -11297,7 +11436,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(PPC::CR0EQ);
} else if (MI.getOpcode() == PPC::SETRNDi) {
DebugLoc dl = MI.getDebugLoc();
- unsigned OldFPSCRReg = MI.getOperand(0).getReg();
+ Register OldFPSCRReg = MI.getOperand(0).getReg();
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
@@ -11378,7 +11517,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
};
- unsigned OldFPSCRReg = MI.getOperand(0).getReg();
+ Register OldFPSCRReg = MI.getOperand(0).getReg();
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
@@ -11393,12 +11532,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// mtfsf 255, NewFPSCRReg
MachineOperand SrcOp = MI.getOperand(1);
MachineRegisterInfo &RegInfo = F->getRegInfo();
- unsigned OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
- unsigned ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
- unsigned ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ Register ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
// The first operand of INSERT_SUBREG should be a register which has
// subregisters, we only care about its RegClass, so we should use an
@@ -11409,14 +11548,14 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.add(SrcOp)
.addImm(1);
- unsigned NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
+ Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
.addReg(OldFPSCRTmpReg)
.addReg(ExtSrcReg)
.addImm(0)
.addImm(62);
- unsigned NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+ Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
// The mask 255 means that put the 32:63 bits of NewFPSCRReg to the 32:63
@@ -13113,6 +13252,61 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
return Val;
}
+SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
+ LSBaseSDNode *LSBase,
+ DAGCombinerInfo &DCI) const {
+ assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
+ "Not a reverse memop pattern!");
+
+ auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
+ auto Mask = SVN->getMask();
+ int i = 0;
+ auto I = Mask.rbegin();
+ auto E = Mask.rend();
+
+ for (; I != E; ++I) {
+ if (*I != i)
+ return false;
+ i++;
+ }
+ return true;
+ };
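// A minimal illustrative sketch (not part of the patch; names are
// illustrative): the lambda above accepts exactly the element-reversal masks,
// e.g. <3,2,1,0> for v4i32 or <1,0> for v2i64.
bool isElementReverseMask(const int *Mask, int NumElts) {
  for (int i = 0; i < NumElts; ++i)
    if (Mask[i] != NumElts - 1 - i)
      return false;
  return true;
}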
+
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = SVN->getValueType(0);
+
+ if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
+ return SDValue();
+
+  // Before P9, the PPCVSXSwapRemoval pass rewrites the element order instead
+  // (see the comment in PPCVSXSwapRemoval.cpp). This combine conflicts with
+  // that pass, so only do it for P9 and later.
+ if (!Subtarget.hasP9Vector())
+ return SDValue();
+
+  if (!IsElementReverse(SVN))
+ return SDValue();
+
+ if (LSBase->getOpcode() == ISD::LOAD) {
+ SDLoc dl(SVN);
+ SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
+ return DAG.getMemIntrinsicNode(
+ PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
+ LSBase->getMemoryVT(), LSBase->getMemOperand());
+ }
+
+ if (LSBase->getOpcode() == ISD::STORE) {
+ SDLoc dl(LSBase);
+ SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
+ LSBase->getBasePtr()};
+ return DAG.getMemIntrinsicNode(
+ PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,
+ LSBase->getMemoryVT(), LSBase->getMemOperand());
+ }
+
+ llvm_unreachable("Expected a load or store node here");
+}
+
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -13159,6 +13353,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return combineFPToIntToFP(N, DCI);
+ case ISD::VECTOR_SHUFFLE:
+ if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
+      LSBaseSDNode *LSBase = cast<LSBaseSDNode>(N->getOperand(0));
+ return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
+ }
+ break;
case ISD::STORE: {
EVT Op1VT = N->getOperand(1).getValueType();
@@ -13170,6 +13370,13 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return Val;
}
+ if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
+      SDValue Val = combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
+ if (Val)
+ return Val;
+ }
+
// Turn STORE (BSWAP) -> sthbrx/stwbrx.
if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
N->getOperand(1).getNode()->hasOneUse() &&
@@ -13903,7 +14110,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
}
-unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
switch (Subtarget.getDarwinDirective()) {
default: break;
case PPC::DIR_970:
@@ -13924,7 +14131,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
// Actual alignment of the loop will depend on the hotness check and other
// logic in alignBlocks.
if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
- return 5;
+ return Align(32);
}
const PPCInstrInfo *TII = Subtarget.getInstrInfo();
@@ -13940,7 +14147,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
}
if (LoopSize > 16 && LoopSize <= 32)
- return 5;
+ return Align(32);
break;
}
@@ -14063,7 +14270,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 'f':
if (Subtarget.hasSPE()) {
if (VT == MVT::f32 || VT == MVT::i32)
- return std::make_pair(0U, &PPC::SPE4RCRegClass);
+ return std::make_pair(0U, &PPC::GPRCRegClass);
if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &PPC::SPERCRegClass);
} else {
@@ -14306,22 +14513,22 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
+Register PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const {
bool isPPC64 = Subtarget.isPPC64();
- bool isDarwinABI = Subtarget.isDarwinABI();
+ bool IsDarwinABI = Subtarget.isDarwinABI();
if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
(!isPPC64 && VT != MVT::i32))
report_fatal_error("Invalid register global variable type");
bool is64Bit = isPPC64 && VT == MVT::i64;
- unsigned Reg = StringSwitch<unsigned>(RegName)
+ Register Reg = StringSwitch<Register>(RegName)
.Case("r1", is64Bit ? PPC::X1 : PPC::R1)
- .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2)
- .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
+ .Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2)
+ .Case("r13", (!isPPC64 && IsDarwinABI) ? Register() :
(is64Bit ? PPC::X13 : PPC::R13))
- .Default(0);
+ .Default(Register());
if (Reg)
return Reg;
@@ -14330,14 +14537,17 @@ unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
// 32-bit SVR4 ABI access everything as got-indirect.
- if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
+ if (Subtarget.is32BitELFABI())
+ return true;
+
+ // AIX accesses everything indirectly through the TOC, which is similar to
+ // the GOT.
+ if (Subtarget.isAIXABI())
return true;
CodeModel::Model CModel = getTargetMachine().getCodeModel();
// If it is small or large code model, module locals are accessed
- // indirectly by loading their address from .toc/.got. The difference
- // is that for large code model we have ADDISTocHa + LDtocL and for
- // small code model we simply have LDtoc.
+ // indirectly by loading their address from .toc/.got.
if (CModel == CodeModel::Small || CModel == CodeModel::Large)
return true;
@@ -14345,14 +14555,8 @@ bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
return true;
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
- const GlobalValue *GV = G->getGlobal();
- unsigned char GVFlags = Subtarget.classifyGlobalReference(GV);
- // The NLP flag indicates that a global access has to use an
- // extra indirection.
- if (GVFlags & PPCII::MO_NLP_FLAG)
- return true;
- }
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA))
+ return Subtarget.isGVIndirectSymbol(G->getGlobal());
return false;
}
@@ -14417,7 +14621,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
- Info.align = 1;
+ Info.align = Align::None();
Info.flags = MachineMemOperand::MOLoad;
return true;
}
@@ -14451,7 +14655,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.size = VT.getStoreSize();
- Info.align = 1;
+ Info.align = Align::None();
Info.flags = MachineMemOperand::MOLoad;
return true;
}
@@ -14503,7 +14707,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(1);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
- Info.align = 1;
+ Info.align = Align::None();
Info.flags = MachineMemOperand::MOStore;
return true;
}
@@ -14536,7 +14740,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.size = VT.getStoreSize();
- Info.align = 1;
+ Info.align = Align::None();
Info.flags = MachineMemOperand::MOStore;
return true;
}
@@ -14786,7 +14990,7 @@ void PPCTargetLowering::insertCopiesSplitCSR(
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
@@ -15146,7 +15350,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// Only duplicate to increase tail-calls for the 64bit SysV ABIs.
- if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64())
+ if (!Subtarget.is64BitELFABI())
return false;
// If not a tail call then no need to proceed.
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 97422c6eda36..62922ea2d4c4 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -412,8 +412,9 @@ namespace llvm {
/// representation.
QBFLT,
- /// Custom extend v4f32 to v2f64.
- FP_EXTEND_LH,
+ /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or
+ /// lower (IDX=1) half of v4f32 to v2f64.
+ FP_EXTEND_HALF,
/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
@@ -456,15 +457,29 @@ namespace llvm {
/// an xxswapd.
LXVD2X,
+ /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian.
+    /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x, depending on
+    /// the vector type, to load the vector in big-endian element order.
+ LOAD_VEC_BE,
+
/// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a
/// v2f32 value into the lower half of a VSR register.
LD_VSX_LH,
+    /// VSRC, CHAIN = LD_SPLAT CHAIN, Ptr - a splatting load memory
+    /// instruction such as LXVDSX or LXVWSX.
+ LD_SPLAT,
+
/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
/// Maps directly to an stxvd2x instruction that will be preceded by
/// an xxswapd.
STXVD2X,
+ /// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian.
+    /// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x, depending on
+    /// the vector type, to store the vector in big-endian element order.
+ STORE_VEC_BE,
+
/// Store scalar integers from VSR.
ST_VSR_SCAL_INT,
@@ -563,9 +578,11 @@ namespace llvm {
bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
unsigned &InsertAtByte, bool &Swap, bool IsLE);
- /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
- /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
- unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
+ /// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
+ /// appropriate for PPC mnemonics (which have a big endian bias - namely
+ /// elements are counted from the left of the vector register).
+ unsigned getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
+ SelectionDAG &DAG);
/// get_VSPLTI_elt - If this is a build_vector of constants which can be
/// formed by using a vspltis[bhw] instruction of the specified element
@@ -716,8 +733,8 @@ namespace llvm {
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
void computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
@@ -725,7 +742,7 @@ namespace llvm {
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+ Align getPrefLoopAlignment(MachineLoop *ML) const override;
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
return true;
@@ -834,6 +851,18 @@ namespace llvm {
return true;
}
+ bool isDesirableToTransformToIntegerOp(unsigned Opc,
+ EVT VT) const override {
+      // Only handle float loads/stores because FP (FPR) loads/stores
+      // take more cycles than integer (GPR) loads/stores on PPC.
+ if (Opc != ISD::LOAD && Opc != ISD::STORE)
+ return false;
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return false;
+
+ return true;
+ }
+
// Returns true if the address of the global is stored in TOC entry.
bool isAccessedAsGotIndirect(SDValue N) const;
@@ -998,6 +1027,8 @@ namespace llvm {
SDValue &FPOpOut,
const SDLoc &dl) const;
+ SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, SDValue GA) const;
+
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
@@ -1155,6 +1186,8 @@ namespace llvm {
SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
+ DAGCombinerInfo &DCI) const;
/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
/// SETCC with integer subtraction when (1) there is a legal way of doing it
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index d598567f8e4e..f16187149d36 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1099,8 +1099,8 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src),
// Support for medium and large code model.
let hasSideEffects = 0 in {
let isReMaterializable = 1 in {
-def ADDIStocHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
- "#ADDIStocHA", []>, isPPC64;
+def ADDIStocHA8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+ "#ADDIStocHA8", []>, isPPC64;
def ADDItocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
"#ADDItocL", []>, isPPC64;
}
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 8176c5120a83..fd3fc2af2327 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -215,21 +215,21 @@ def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG), SDLoc(N));
+ return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 1, *CurDAG), SDLoc(N));
}]>;
def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
}], VSPLTB_get_imm>;
def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG), SDLoc(N));
+ return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 2, *CurDAG), SDLoc(N));
}]>;
def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
}], VSPLTH_get_imm>;
def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG), SDLoc(N));
+ return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 4, *CurDAG), SDLoc(N));
}]>;
def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
@@ -331,7 +331,7 @@ class VXBX_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
class VXCR_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
: VXForm_CR<xo, (outs vrrc:$vD), (ins vrrc:$vA, u1imm:$ST, u4imm:$SIX),
!strconcat(opc, " $vD, $vA, $ST, $SIX"), IIC_VecFP,
- [(set Ty:$vD, (IntID Ty:$vA, imm:$ST, imm:$SIX))]>;
+ [(set Ty:$vD, (IntID Ty:$vA, timm:$ST, timm:$SIX))]>;
//===----------------------------------------------------------------------===//
// Instruction Definitions.
@@ -401,10 +401,10 @@ let isCodeGenOnly = 1 in {
def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
"mfvscr $vD", IIC_LdStStore,
- [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>;
+ [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>;
def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
"mtvscr $vB", IIC_LdStLoad,
- [(int_ppc_altivec_mtvscr v4i32:$vB)]>;
+ [(int_ppc_altivec_mtvscr v4i32:$vB)]>;
let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads.
def LVEBX: XForm_1_memOp<31, 7, (outs vrrc:$vD), (ins memrr:$src),
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index a48eb1690695..96b9c9a119c0 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -1209,20 +1209,13 @@ class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = XT{5};
}
-class XX3Form_Zero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+class XX3Form_SameOp<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
let XA = XT;
let XB = XT;
}
-class XX3Form_SetZero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
- : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
- let XB = XT;
- let XA = XT;
-}
-
class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: I<opcode, OOL, IOL, asmstr, itin> {
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index a787bdd56b9d..6b10672965c9 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -90,7 +90,6 @@ enum SpillOpcodeKey {
SOK_QuadBitSpill,
SOK_SpillToVSR,
SOK_SPESpill,
- SOK_SPE4Spill,
SOK_LastOpcodeSpill // This must be last on the enum.
};
@@ -184,10 +183,10 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return Latency;
const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
- unsigned Reg = DefMO.getReg();
+ Register Reg = DefMO.getReg();
bool IsRegCR;
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
const MachineRegisterInfo *MRI =
&DefMI.getParent()->getParent()->getRegInfo();
IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
@@ -330,11 +329,13 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case PPC::LIS8:
case PPC::QVGPCI:
case PPC::ADDIStocHA:
+ case PPC::ADDIStocHA8:
case PPC::ADDItocL:
case PPC::LOAD_STACK_GUARD:
case PPC::XXLXORz:
case PPC::XXLXORspz:
case PPC::XXLXORdpz:
+ case PPC::XXLEQVOnes:
case PPC::V_SET0B:
case PPC::V_SET0H:
case PPC::V_SET0:
@@ -448,7 +449,8 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return &MI;
}
-bool PPCInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+bool PPCInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
// For VSX A-Type FMA instructions, it is the first two operands that can be
// commuted, however, because the non-encoded tied input operand is listed
@@ -966,11 +968,11 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
getKillRegState(KillSrc);
return;
} else if (PPC::SPERCRegClass.contains(SrcReg) &&
- PPC::SPE4RCRegClass.contains(DestReg)) {
+ PPC::GPRCRegClass.contains(DestReg)) {
BuildMI(MBB, I, DL, get(PPC::EFSCFD), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
return;
- } else if (PPC::SPE4RCRegClass.contains(SrcReg) &&
+ } else if (PPC::GPRCRegClass.contains(SrcReg) &&
PPC::SPERCRegClass.contains(DestReg)) {
BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg);
getKillRegState(KillSrc);
@@ -1009,8 +1011,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = PPC::QVFMRb;
else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
Opc = PPC::CROR;
- else if (PPC::SPE4RCRegClass.contains(DestReg, SrcReg))
- Opc = PPC::OR;
else if (PPC::SPERCRegClass.contains(DestReg, SrcReg))
Opc = PPC::EVOR;
else
@@ -1043,8 +1043,6 @@ unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg,
OpcodeIndex = SOK_Float4Spill;
} else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_SPESpill;
- } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_SPE4Spill;
} else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
@@ -1083,8 +1081,6 @@ unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg,
OpcodeIndex = SOK_Float4Spill;
} else if (PPC::SPERCRegClass.contains(Reg)) {
OpcodeIndex = SOK_SPESpill;
- } else if (PPC::SPE4RCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_SPE4Spill;
} else if (PPC::CRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.contains(Reg)) {
@@ -1133,8 +1129,6 @@ PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg,
OpcodeIndex = SOK_Float4Spill;
} else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_SPESpill;
- } else if (PPC::SPE4RCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_SPE4Spill;
} else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
@@ -1173,8 +1167,6 @@ PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg,
OpcodeIndex = SOK_Float4Spill;
} else if (PPC::SPERCRegClass.contains(Reg)) {
OpcodeIndex = SOK_SPESpill;
- } else if (PPC::SPE4RCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_SPE4Spill;
} else if (PPC::CRRCRegClass.contains(Reg)) {
OpcodeIndex = SOK_CRSpill;
} else if (PPC::CRBITRCRegClass.contains(Reg)) {
@@ -1648,7 +1640,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
return false;
int OpC = CmpInstr.getOpcode();
- unsigned CRReg = CmpInstr.getOperand(0).getReg();
+ Register CRReg = CmpInstr.getOperand(0).getReg();
// FP record forms set CR1 based on the exception status bits, not a
// comparison with zero.
@@ -1671,7 +1663,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// Look through copies unless that gets us to a physical register.
unsigned ActualSrc = TRI->lookThruCopyLike(SrcReg, MRI);
- if (TargetRegisterInfo::isVirtualRegister(ActualSrc))
+ if (Register::isVirtualRegister(ActualSrc))
SrcReg = ActualSrc;
// Get the unique definition of SrcReg.
@@ -1937,7 +1929,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// Rotates are expensive instructions. If we're emitting a record-form
// rotate that can just be an andi/andis, we should just emit that.
if (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINM8) {
- unsigned GPRRes = MI->getOperand(0).getReg();
+ Register GPRRes = MI->getOperand(0).getReg();
int64_t SH = MI->getOperand(2).getImm();
int64_t MB = MI->getOperand(3).getImm();
int64_t ME = MI->getOperand(4).getImm();
@@ -2122,7 +2114,7 @@ bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const {
llvm_unreachable("Unknown Operation!");
}
- unsigned TargetReg = MI.getOperand(0).getReg();
+ Register TargetReg = MI.getOperand(0).getReg();
unsigned Opcode;
if ((TargetReg >= PPC::F0 && TargetReg <= PPC::F31) ||
(TargetReg >= PPC::VSL0 && TargetReg <= PPC::VSL31))
@@ -2184,7 +2176,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return expandVSXMemPseudo(MI);
}
case PPC::SPILLTOVSR_LD: {
- unsigned TargetReg = MI.getOperand(0).getReg();
+ Register TargetReg = MI.getOperand(0).getReg();
if (PPC::VSFRCRegClass.contains(TargetReg)) {
MI.setDesc(get(PPC::DFLOADf64));
return expandPostRAPseudo(MI);
@@ -2194,7 +2186,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
case PPC::SPILLTOVSR_ST: {
- unsigned SrcReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(0).getReg();
if (PPC::VSFRCRegClass.contains(SrcReg)) {
NumStoreSPILLVSRRCAsVec++;
MI.setDesc(get(PPC::DFSTOREf64));
@@ -2206,7 +2198,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
case PPC::SPILLTOVSR_LDX: {
- unsigned TargetReg = MI.getOperand(0).getReg();
+ Register TargetReg = MI.getOperand(0).getReg();
if (PPC::VSFRCRegClass.contains(TargetReg))
MI.setDesc(get(PPC::LXSDX));
else
@@ -2214,7 +2206,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
case PPC::SPILLTOVSR_STX: {
- unsigned SrcReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(0).getReg();
if (PPC::VSFRCRegClass.contains(SrcReg)) {
NumStoreSPILLVSRRCAsVec++;
MI.setDesc(get(PPC::STXSDX));
@@ -2279,10 +2271,10 @@ void PPCInstrInfo::replaceInstrOperandWithImm(MachineInstr &MI,
int64_t Imm) const {
assert(MI.getOperand(OpNo).isReg() && "Operand must be a REG");
// Replace the REG with the Immediate.
- unsigned InUseReg = MI.getOperand(OpNo).getReg();
+ Register InUseReg = MI.getOperand(OpNo).getReg();
MI.getOperand(OpNo).ChangeToImmediate(Imm);
- if (empty(MI.implicit_operands()))
+ if (MI.implicit_operands().empty())
return;
// We need to make sure that the MI didn't have any implicit use
@@ -2328,6 +2320,23 @@ void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI,
.addImm(LII.Imm);
}
+MachineInstr *PPCInstrInfo::getDefMIPostRA(unsigned Reg, MachineInstr &MI,
+ bool &SeenIntermediateUse) const {
+ assert(!MI.getParent()->getParent()->getRegInfo().isSSA() &&
+ "Should be called after register allocation.");
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI;
+ It++;
+ SeenIntermediateUse = false;
+ for (; It != E; ++It) {
+ if (It->modifiesRegister(Reg, TRI))
+ return &*It;
+ if (It->readsRegister(Reg, TRI))
+ SeenIntermediateUse = true;
+ }
+ return nullptr;
+}
+
MachineInstr *PPCInstrInfo::getForwardingDefMI(
MachineInstr &MI,
unsigned &OpNoForForwarding,
@@ -2342,11 +2351,11 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
if (!MI.getOperand(i).isReg())
continue;
- unsigned Reg = MI.getOperand(i).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MI.getOperand(i).getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
unsigned TrueReg = TRI->lookThruCopyLike(Reg, MRI);
- if (TargetRegisterInfo::isVirtualRegister(TrueReg)) {
+ if (Register::isVirtualRegister(TrueReg)) {
DefMI = MRI->getVRegDef(TrueReg);
if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) {
OpNoForForwarding = i;
@@ -2370,7 +2379,10 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 ||
Opc == PPC::RLWINM || Opc == PPC::RLWINMo ||
Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o;
- if (!instrHasImmForm(MI, III, true) && !ConvertibleImmForm)
+ bool IsVFReg = (MI.getNumOperands() && MI.getOperand(0).isReg())
+ ? isVFRegister(MI.getOperand(0).getReg())
+ : false;
+ if (!ConvertibleImmForm && !instrHasImmForm(Opc, IsVFReg, III, true))
return nullptr;
// Don't convert or %X, %Y, %Y since that's just a register move.
@@ -2381,29 +2393,24 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
MachineOperand &MO = MI.getOperand(i);
SeenIntermediateUse = false;
if (MO.isReg() && MO.isUse() && !MO.isImplicit()) {
- MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI;
- It++;
- unsigned Reg = MI.getOperand(i).getReg();
-
- // Is this register defined by some form of add-immediate (including
- // load-immediate) within this basic block?
- for ( ; It != E; ++It) {
- if (It->modifiesRegister(Reg, &getRegisterInfo())) {
- switch (It->getOpcode()) {
- default: break;
- case PPC::LI:
- case PPC::LI8:
- case PPC::ADDItocL:
- case PPC::ADDI:
- case PPC::ADDI8:
- OpNoForForwarding = i;
- return &*It;
- }
+ Register Reg = MI.getOperand(i).getReg();
+ // If we see another use of this reg between the def and the MI,
+      // we want to flag it so the def isn't deleted.
+ MachineInstr *DefMI = getDefMIPostRA(Reg, MI, SeenIntermediateUse);
+ if (DefMI) {
+ // Is this register defined by some form of add-immediate (including
+ // load-immediate) within this basic block?
+ switch (DefMI->getOpcode()) {
+ default:
break;
- } else if (It->readsRegister(Reg, &getRegisterInfo()))
- // If we see another use of this reg between the def and the MI,
- // we want to flat it so the def isn't deleted.
- SeenIntermediateUse = true;
+ case PPC::LI:
+ case PPC::LI8:
+ case PPC::ADDItocL:
+ case PPC::ADDI:
+ case PPC::ADDI8:
+ OpNoForForwarding = i;
+ return DefMI;
+ }
}
}
}
@@ -2417,7 +2424,7 @@ const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const {
{PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
PPC::SPILL_CRBIT, PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX,
PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb,
- PPC::SPILLTOVSR_ST, PPC::EVSTDD, PPC::SPESTW},
+ PPC::SPILLTOVSR_ST, PPC::EVSTDD},
// Power 9
{PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
PPC::SPILL_CRBIT, PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32,
@@ -2433,7 +2440,7 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const {
{PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX,
PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb,
- PPC::SPILLTOVSR_LD, PPC::EVLDD, PPC::SPELWZ},
+ PPC::SPILLTOVSR_LD, PPC::EVLDD},
// Power 9
{PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, PPC::DFLOADf32,
@@ -2538,12 +2545,15 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
"The forwarding operand needs to be valid at this point");
bool IsForwardingOperandKilled = MI.getOperand(ForwardingOperand).isKill();
bool KillFwdDefMI = !SeenIntermediateUse && IsForwardingOperandKilled;
- unsigned ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg();
+ Register ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg();
if (KilledDef && KillFwdDefMI)
*KilledDef = DefMI;
ImmInstrInfo III;
- bool HasImmForm = instrHasImmForm(MI, III, PostRA);
+ bool IsVFReg = MI.getOperand(0).isReg()
+ ? isVFRegister(MI.getOperand(0).getReg())
+ : false;
+ bool HasImmForm = instrHasImmForm(MI.getOpcode(), IsVFReg, III, PostRA);
// If this is a reg+reg instruction that has a reg+imm form,
// and one of the operands is produced by an add-immediate,
// try to convert it.
@@ -2591,7 +2601,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
// If a compare-immediate is fed by an immediate and is itself an input of
// an ISEL (the most common case) into a COPY of the correct register.
bool Changed = false;
- unsigned DefReg = MI.getOperand(0).getReg();
+ Register DefReg = MI.getOperand(0).getReg();
int64_t Comparand = MI.getOperand(2).getImm();
int64_t SExtComparand = ((uint64_t)Comparand & ~0x7FFFuLL) != 0 ?
(Comparand | 0xFFFFFFFFFFFF0000) : Comparand;
@@ -2601,8 +2611,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
if (UseOpc != PPC::ISEL && UseOpc != PPC::ISEL8)
continue;
unsigned CRSubReg = CompareUseMI.getOperand(3).getSubReg();
- unsigned TrueReg = CompareUseMI.getOperand(1).getReg();
- unsigned FalseReg = CompareUseMI.getOperand(2).getReg();
+ Register TrueReg = CompareUseMI.getOperand(1).getReg();
+ Register FalseReg = CompareUseMI.getOperand(2).getReg();
unsigned RegToCopy = selectReg(SExtImm, SExtComparand, Opc, TrueReg,
FalseReg, CRSubReg);
if (RegToCopy == PPC::NoRegister)
@@ -2777,9 +2787,8 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
return false;
}
-bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
+bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg,
ImmInstrInfo &III, bool PostRA) const {
- unsigned Opc = MI.getOpcode();
// The vast majority of the instructions would need their operand 2 replaced
// with an immediate when switching to the reg+imm form. A marked exception
// are the update form loads/stores for which a constant operand 2 would need
@@ -3111,7 +3120,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
break;
case PPC::LXSSPX:
if (PostRA) {
- if (isVFRegister(MI.getOperand(0).getReg()))
+ if (IsVFReg)
III.ImmOpcode = PPC::LXSSP;
else {
III.ImmOpcode = PPC::LFS;
@@ -3125,7 +3134,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
break;
case PPC::LXSDX:
if (PostRA) {
- if (isVFRegister(MI.getOperand(0).getReg()))
+ if (IsVFReg)
III.ImmOpcode = PPC::LXSD;
else {
III.ImmOpcode = PPC::LFD;
@@ -3143,7 +3152,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
break;
case PPC::STXSSPX:
if (PostRA) {
- if (isVFRegister(MI.getOperand(0).getReg()))
+ if (IsVFReg)
III.ImmOpcode = PPC::STXSSP;
else {
III.ImmOpcode = PPC::STFS;
@@ -3157,7 +3166,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
break;
case PPC::STXSDX:
if (PostRA) {
- if (isVFRegister(MI.getOperand(0).getReg()))
+ if (IsVFReg)
III.ImmOpcode = PPC::STXSD;
else {
III.ImmOpcode = PPC::STFD;
@@ -3287,7 +3296,7 @@ bool PPCInstrInfo::isRegElgibleForForwarding(
if (MRI.isSSA())
return false;
- unsigned Reg = RegMO.getReg();
+ Register Reg = RegMO.getReg();
// Walking the inst in reverse(MI-->DefMI) to get the last DEF of the Reg.
MachineBasicBlock::const_reverse_iterator It = MI;
@@ -3511,8 +3520,8 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
if (PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) {
unsigned PosForOrigZero = III.ZeroIsSpecialOrig ? III.ZeroIsSpecialOrig :
III.ZeroIsSpecialNew + 1;
- unsigned OrigZeroReg = MI.getOperand(PosForOrigZero).getReg();
- unsigned NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg();
+ Register OrigZeroReg = MI.getOperand(PosForOrigZero).getReg();
+ Register NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg();
// If R0 is in the operand where zero is special for the new instruction,
// it is unsafe to transform if the constant operand isn't that operand.
if ((NewZeroReg == PPC::R0 || NewZeroReg == PPC::X0) &&
@@ -3563,16 +3572,20 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
} else {
// The 32 bit and 64 bit instructions are quite different.
if (SpecialShift32) {
- // Left shifts use (N, 0, 31-N), right shifts use (32-N, N, 31).
- uint64_t SH = RightShift ? 32 - ShAmt : ShAmt;
+ // Left shifts use (N, 0, 31-N).
+      // Right shifts use (32-N, N, 31) if 0 < N < 32,
+      // and (0, 0, 31) if N == 0.
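+      // For example, a 32-bit right shift by 0 becomes an RLWINM with
+      // (SH, MB, ME) = (0, 0, 31), which simply copies the low word.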
+ uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt;
uint64_t MB = RightShift ? ShAmt : 0;
uint64_t ME = RightShift ? 31 : 31 - ShAmt;
replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH);
MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB)
.addImm(ME);
} else {
- // Left shifts use (N, 63-N), right shifts use (64-N, N).
- uint64_t SH = RightShift ? 64 - ShAmt : ShAmt;
+ // Left shifts use (N, 63-N).
+      // Right shifts use (64-N, N) if 0 < N < 64,
+      // and (0, 0) if N == 0.
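+      // For example, a 64-bit right shift by 0 becomes an RLDICL with
+      // (SH, MB) = (0, 0), i.e. a plain register copy.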
+ uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 64 - ShAmt : ShAmt;
uint64_t ME = RightShift ? ShAmt : 63 - ShAmt;
replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH);
MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME);
@@ -3601,8 +3614,8 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
if (III.ZeroIsSpecialNew) {
// If operand at III.ZeroIsSpecialNew is physical reg(eg: ZERO/ZERO8), no
// need to fix up register class.
- unsigned RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg();
- if (TargetRegisterInfo::isVirtualRegister(RegToModify)) {
+ Register RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg();
+ if (Register::isVirtualRegister(RegToModify)) {
const TargetRegisterClass *NewRC =
MRI.getRegClass(RegToModify)->hasSuperClassEq(&PPC::GPRCRegClass) ?
&PPC::GPRC_and_GPRC_NOR0RegClass : &PPC::G8RC_and_G8RC_NOX0RegClass;
@@ -3747,7 +3760,7 @@ bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const {
return false;
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
unsigned StackOffset = MI.getOperand(1).getImm();
- unsigned StackReg = MI.getOperand(2).getReg();
+ Register StackReg = MI.getOperand(2).getReg();
if (StackReg == PPC::X1 && StackOffset == TOCSaveOffset)
return true;
@@ -3772,7 +3785,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
switch (MI.getOpcode()) {
case PPC::COPY: {
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
// In both ELFv1 and v2 ABI, method parameters and the return value
// are sign- or zero-extended.
@@ -3781,7 +3794,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
// We check the ZExt/SExt flags for a method parameter.
if (MI.getParent()->getBasicBlock() ==
&MF->getFunction().getEntryBlock()) {
- unsigned VReg = MI.getOperand(0).getReg();
+ Register VReg = MI.getOperand(0).getReg();
if (MF->getRegInfo().isLiveIn(VReg))
return SignExt ? FuncInfo->isLiveInSExt(VReg) :
FuncInfo->isLiveInZExt(VReg);
@@ -3818,7 +3831,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
}
// If this is a copy from another register, we recursively check source.
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ if (!Register::isVirtualRegister(SrcReg))
return false;
const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
if (SrcMI != NULL)
@@ -3841,8 +3854,8 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
case PPC::XORIS8: {
// logical operation with 16-bit immediate does not change the upper bits.
// So, we track the operand register as we do for register copy.
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(SrcReg))
return false;
const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
if (SrcMI != NULL)
@@ -3870,8 +3883,8 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
for (unsigned I = 1; I != E; I += D) {
if (MI.getOperand(I).isReg()) {
- unsigned SrcReg = MI.getOperand(I).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ Register SrcReg = MI.getOperand(I).getReg();
+ if (!Register::isVirtualRegister(SrcReg))
return false;
const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
if (SrcMI == NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1))
@@ -3893,12 +3906,12 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
assert(MI.getOperand(1).isReg() && MI.getOperand(2).isReg());
- unsigned SrcReg1 = MI.getOperand(1).getReg();
- unsigned SrcReg2 = MI.getOperand(2).getReg();
+ Register SrcReg1 = MI.getOperand(1).getReg();
+ Register SrcReg2 = MI.getOperand(2).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg1) ||
- !TargetRegisterInfo::isVirtualRegister(SrcReg2))
- return false;
+ if (!Register::isVirtualRegister(SrcReg1) ||
+ !Register::isVirtualRegister(SrcReg2))
+ return false;
const MachineInstr *MISrc1 = MRI->getVRegDef(SrcReg1);
const MachineInstr *MISrc2 = MRI->getVRegDef(SrcReg2);
@@ -3923,21 +3936,99 @@ bool PPCInstrInfo::isBDNZ(unsigned Opcode) const {
return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ));
}
-bool PPCInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
- MachineInstr *&CmpInst) const {
- MachineBasicBlock *LoopEnd = L.getBottomBlock();
- MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
- // We really "analyze" only CTR loops right now.
- if (I != LoopEnd->end() && isBDNZ(I->getOpcode())) {
- IndVarInst = nullptr;
- CmpInst = &*I;
- return false;
+namespace {
+class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+ MachineInstr *Loop, *EndLoop, *LoopCount;
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ int64_t TripCount;
+
+public:
+ PPCPipelinerLoopInfo(MachineInstr *Loop, MachineInstr *EndLoop,
+ MachineInstr *LoopCount)
+ : Loop(Loop), EndLoop(EndLoop), LoopCount(LoopCount),
+ MF(Loop->getParent()->getParent()),
+ TII(MF->getSubtarget().getInstrInfo()) {
+    // Inspect the loop-count instruction up-front, as it may be deleted when we call
+ // createTripCountGreaterCondition.
+ if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI)
+ TripCount = LoopCount->getOperand(1).getImm();
+ else
+ TripCount = -1;
}
- return true;
+
+ bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+ // Only ignore the terminator.
+ return MI == EndLoop;
+ }
+
+ Optional<bool>
+ createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond) override {
+ if (TripCount == -1) {
+      // The BDZ/BDZ8 we insert will itself decrement the CTR by 1, so we
+      // don't need to generate anything else here.
+ Cond.push_back(MachineOperand::CreateImm(0));
+ Cond.push_back(MachineOperand::CreateReg(
+ MF->getSubtarget<PPCSubtarget>().isPPC64() ? PPC::CTR8 : PPC::CTR,
+ true));
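+      // An empty Optional tells the pipeliner that the comparison cannot be
+      // resolved at compile time, so it emits a branch based on Cond instead.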
+ return {};
+ }
+
+ return TripCount > TC;
+ }
+
+ void setPreheader(MachineBasicBlock *NewPreheader) override {
+    // Do nothing. We want the loop setup instruction (MTCTRloop/MTCTR8loop) to
+    // stay in the *old* preheader, so we can use BDZ in the prologs to adjust
+    // the loop trip count.
+ }
+
+ void adjustTripCount(int TripCountAdjust) override {
+ // If the loop trip count is a compile-time value, then just change the
+ // value.
+ if (LoopCount->getOpcode() == PPC::LI8 ||
+ LoopCount->getOpcode() == PPC::LI) {
+ int64_t TripCount = LoopCount->getOperand(1).getImm() + TripCountAdjust;
+ LoopCount->getOperand(1).setImm(TripCount);
+ return;
+ }
+
+    // The BDZ/BDZ8 we insert will also decrement the CTR by 1, so we don't
+    // need to generate anything here.
+ }
+
+ void disposed() override {
+ Loop->eraseFromParent();
+    // Also delete the instruction that defined the loop count.
+ LoopCount->eraseFromParent();
+ }
+};
+} // namespace
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+PPCInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+ // We really "analyze" only hardware loops right now.
+ MachineBasicBlock::iterator I = LoopBB->getFirstTerminator();
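+  // Treat the predecessor of the loop block that is not the block itself
+  // (i.e. not the latch edge) as the preheader.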
+ MachineBasicBlock *Preheader = *LoopBB->pred_begin();
+ if (Preheader == LoopBB)
+ Preheader = *std::next(LoopBB->pred_begin());
+ MachineFunction *MF = Preheader->getParent();
+
+ if (I != LoopBB->end() && isBDNZ(I->getOpcode())) {
+ SmallPtrSet<MachineBasicBlock *, 8> Visited;
+ if (MachineInstr *LoopInst = findLoopInstr(*Preheader, Visited)) {
+ Register LoopCountReg = LoopInst->getOperand(0).getReg();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg);
+ return std::make_unique<PPCPipelinerLoopInfo>(LoopInst, &*I, LoopCount);
+ }
+ }
+ return nullptr;
}
-MachineInstr *
-PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const {
+MachineInstr *PPCInstrInfo::findLoopInstr(
+ MachineBasicBlock &PreHeader,
+ SmallPtrSet<MachineBasicBlock *, 8> &Visited) const {
unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop);
@@ -3948,50 +4039,6 @@ PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const {
return nullptr;
}
-unsigned PPCInstrInfo::reduceLoopCount(
- MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
- MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
- SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
- unsigned MaxIter) const {
- // We expect a hardware loop currently. This means that IndVar is set
- // to null, and the compare is the ENDLOOP instruction.
- assert((!IndVar) && isBDNZ(Cmp.getOpcode()) && "Expecting a CTR loop");
- MachineFunction *MF = MBB.getParent();
- DebugLoc DL = Cmp.getDebugLoc();
- MachineInstr *Loop = findLoopInstr(PreHeader);
- if (!Loop)
- return 0;
- unsigned LoopCountReg = Loop->getOperand(0).getReg();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg);
-
- if (!LoopCount)
- return 0;
- // If the loop trip count is a compile-time value, then just change the
- // value.
- if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) {
- int64_t Offset = LoopCount->getOperand(1).getImm();
- if (Offset <= 1) {
- LoopCount->eraseFromParent();
- Loop->eraseFromParent();
- return 0;
- }
- LoopCount->getOperand(1).setImm(Offset - 1);
- return Offset - 1;
- }
-
- // The loop trip count is a run-time value.
- // We need to subtract one from the trip count,
- // and insert branch later to check if we're done with the loop.
-
- // Since BDZ/BDZ8 that we will insert will also decrease the ctr by 1,
- // so we don't need to generate any thing here.
- Cond.push_back(MachineOperand::CreateImm(0));
- Cond.push_back(MachineOperand::CreateReg(
- Subtarget.isPPC64() ? PPC::CTR8 : PPC::CTR, true));
- return LoopCountReg;
-}
-
// Return true if get the base operand, byte offset of an instruction and the
// memory width. Width is the size of memory that is being loaded/stored.
bool PPCInstrInfo::getMemOperandWithOffsetWidth(
@@ -4018,8 +4065,7 @@ bool PPCInstrInfo::getMemOperandWithOffsetWidth(
}
bool PPCInstrInfo::areMemAccessesTriviallyDisjoint(
- const MachineInstr &MIa, const MachineInstr &MIb,
- AliasAnalysis * /*AA*/) const {
+ const MachineInstr &MIa, const MachineInstr &MIb) const {
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 70fb757e8f1e..19ab30cb0908 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -248,11 +248,11 @@ public:
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const override;
+ AAResults *AA) const override;
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
void insertNoop(MachineBasicBlock &MBB,
@@ -370,8 +370,7 @@ public:
/// otherwise
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA = nullptr) const override;
+ const MachineInstr &MIb) const override;
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
@@ -439,9 +438,14 @@ public:
void replaceInstrOperandWithImm(MachineInstr &MI, unsigned OpNo,
int64_t Imm) const;
- bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III,
+ bool instrHasImmForm(unsigned Opc, bool IsVFReg, ImmInstrInfo &III,
bool PostRA) const;
+  // In the post-RA phase, try to find the instruction that defines \p Reg
+  // before \p MI. \p SeenIntermediateUse is set to true if there are uses of
+  // \p Reg between the found def and \p MI.
+ MachineInstr *getDefMIPostRA(unsigned Reg, MachineInstr &MI,
+ bool &SeenIntermediateUse) const;
+
/// getRegNumForOperand - some operands use different numbering schemes
/// for the same registers. For example, a VSX instruction may have any of
/// vs0-vs63 allocated whereas an Altivec instruction could only have
@@ -481,26 +485,14 @@ public:
/// On PPC, we have two instructions used to set-up the hardware loop
/// (MTCTRloop, MTCTR8loop) with corresponding endloop (BDNZ, BDNZ8)
/// instructions to indicate the end of a loop.
- MachineInstr *findLoopInstr(MachineBasicBlock &PreHeader) const;
-
- /// Analyze the loop code to find the loop induction variable and compare used
- /// to compute the number of iterations. Currently, we analyze loop that are
- /// controlled using hardware loops. In this case, the induction variable
- /// instruction is null. For all other cases, this function returns true,
- /// which means we're unable to analyze it. \p IndVarInst and \p CmpInst will
- /// return new values when we can analyze the readonly loop \p L, otherwise,
- /// nothing got changed
- bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
- MachineInstr *&CmpInst) const override;
- /// Generate code to reduce the loop iteration by one and check if the loop
- /// is finished. Return the value/register of the new loop count. We need
- /// this function when peeling off one or more iterations of a loop. This
- /// function assumes the last iteration is peeled first.
- unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
- MachineInstr *IndVar, MachineInstr &Cmp,
- SmallVectorImpl<MachineOperand> &Cond,
- SmallVectorImpl<MachineInstr *> &PrevInsts,
- unsigned Iter, unsigned MaxIter) const override;
+ MachineInstr *
+ findLoopInstr(MachineBasicBlock &PreHeader,
+ SmallPtrSet<MachineBasicBlock *, 8> &Visited) const;
+
+  /// Analyze the single-basic-block loop \p LoopBB and, if its structure can
+  /// be understood well enough, produce a PipelinerLoopInfo object.
+ std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+ analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
};
}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index c313337047f0..24183277519b 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -386,7 +386,9 @@ def immZExt16 : PatLeaf<(imm), [{
// field. Used by instructions like 'ori'.
return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
}], LO16>;
-def immAnyExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm) || isUInt<8>(Imm); }]>;
+def immNonAllOneAnyExt8 : ImmLeaf<i32, [{
+ return (isInt<8>(Imm) && (Imm != -1)) || (isUInt<8>(Imm) && (Imm != 0xFF));
+}]>;
def immSExt5NonZero : ImmLeaf<i32, [{ return Imm && isInt<5>(Imm); }]>;
// imm16Shifted* - These match immediates where the low 16-bits are zero. There
@@ -577,7 +579,7 @@ def sperc : RegisterOperand<SPERC> {
def PPCRegSPE4RCAsmOperand : AsmOperandClass {
let Name = "RegSPE4RC"; let PredicateMethod = "isRegNumber";
}
-def spe4rc : RegisterOperand<SPE4RC> {
+def spe4rc : RegisterOperand<GPRC> {
let ParserMatchClass = PPCRegSPE4RCAsmOperand;
}
@@ -3161,7 +3163,16 @@ def ADDISdtprelHA32 : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s1
def LWZtoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
"#LWZtoc",
[(set i32:$rD,
+ (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
+def LWZtocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc_nor0:$reg),
+ "#LWZtocL",
+ [(set i32:$rD,
(PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
+def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp),
+ "#ADDIStocHA",
+ [(set i32:$rD,
+ (PPCtoc_entry i32:$reg, tglobaladdr:$disp))]>;
+
// Get Global (GOT) Base Register offset, from the word immediately preceding
// the function label.
def UpdateGBR : PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>;
@@ -3177,21 +3188,21 @@ def : Pat<(srl i32:$rS, i32:$rB),
def : Pat<(shl i32:$rS, i32:$rB),
(SLW $rS, $rB)>;
-def : Pat<(zextloadi1 iaddr:$src),
+def : Pat<(i32 (zextloadi1 iaddr:$src)),
(LBZ iaddr:$src)>;
-def : Pat<(zextloadi1 xaddr:$src),
+def : Pat<(i32 (zextloadi1 xaddr:$src)),
(LBZX xaddr:$src)>;
-def : Pat<(extloadi1 iaddr:$src),
+def : Pat<(i32 (extloadi1 iaddr:$src)),
(LBZ iaddr:$src)>;
-def : Pat<(extloadi1 xaddr:$src),
+def : Pat<(i32 (extloadi1 xaddr:$src)),
(LBZX xaddr:$src)>;
-def : Pat<(extloadi8 iaddr:$src),
+def : Pat<(i32 (extloadi8 iaddr:$src)),
(LBZ iaddr:$src)>;
-def : Pat<(extloadi8 xaddr:$src),
+def : Pat<(i32 (extloadi8 xaddr:$src)),
(LBZX xaddr:$src)>;
-def : Pat<(extloadi16 iaddr:$src),
+def : Pat<(i32 (extloadi16 iaddr:$src)),
(LHZ iaddr:$src)>;
-def : Pat<(extloadi16 xaddr:$src),
+def : Pat<(i32 (extloadi16 xaddr:$src)),
(LHZX xaddr:$src)>;
let Predicates = [HasFPU] in {
def : Pat<(f64 (extloadf32 iaddr:$src)),
@@ -3564,23 +3575,6 @@ def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)),
(EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
(LO16 imm:$imm)), sub_eq)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
- (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
- (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
- (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
- (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
- (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
- (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
-
-defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
- (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
- (LO16 imm:$imm)), sub_eq)>;
-
def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)),
(EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)),
@@ -3592,17 +3586,6 @@ def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)),
def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)),
(EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
- (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
- (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
- (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
- (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
- (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
-
// SETCC for i64.
def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)),
(EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
@@ -3632,6 +3615,47 @@ def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)),
(EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
(LO16 imm:$imm)), sub_eq)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
+ (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
+ (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+// Instantiations of CRNotPat for i32.
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
+ (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+ (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
+ (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
+ (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+// Instantiations of CRNotPat for i64.
defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)),
(EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)),
@@ -3649,17 +3673,6 @@ defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
(EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
(LO16 imm:$imm)), sub_eq)>;
-def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
- (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
-def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
- (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
-def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
- (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
- (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
-def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
- (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
-
defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)),
(EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)),
@@ -3671,6 +3684,56 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
(EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+let Predicates = [HasFPU] in {
+// Instantiations of CRNotPat for f32.
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)),
+ (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+
+// Instantiations of CRNotPat for f64.
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
+ (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+
+// Instantiations of CRNotPat for f128.
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)),
+ (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>;
+}
+
// SETCC for f32.
let Predicates = [HasFPU] in {
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
@@ -3688,21 +3751,6 @@ def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)),
(EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)),
- (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
-
// SETCC for f64.
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
@@ -3719,21 +3767,6 @@ def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)),
(EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
- (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
-
// SETCC for f128.
def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOLT)),
(EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
@@ -3750,21 +3783,6 @@ def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETEQ)),
def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETUO)),
(EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>;
-defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)),
- (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>;
-
}
// This must be in this file because it relies on patterns defined in this file
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 07f38a61d098..2aad5860d87f 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -58,8 +58,12 @@ def SDT_PPCldvsxlh : SDTypeProfile<1, 1, [
SDTCisVT<0, v4f32>, SDTCisPtrTy<1>
]>;
-def SDT_PPCfpextlh : SDTypeProfile<1, 1, [
- SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>
+def SDT_PPCfpexth : SDTypeProfile<1, 2, [
+ SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>, SDTCisPtrTy<2>
+]>;
+
+def SDT_PPCldsplat : SDTypeProfile<1, 1, [
+ SDTCisVec<0>, SDTCisPtrTy<1>
]>;
// Little-endian-specific nodes.
@@ -78,12 +82,21 @@ def SDTVecConv : SDTypeProfile<1, 2, [
def SDTVabsd : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>
]>;
-
+def SDT_PPCld_vec_be : SDTypeProfile<1, 1, [
+ SDTCisVec<0>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCst_vec_be : SDTypeProfile<0, 2, [
+ SDTCisVec<0>, SDTCisPtrTy<1>
+]>;
def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
[SDNPHasChain, SDNPMayStore]>;
+def PPCld_vec_be : SDNode<"PPCISD::LOAD_VEC_BE", SDT_PPCld_vec_be,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCst_vec_be : SDNode<"PPCISD::STORE_VEC_BE", SDT_PPCst_vec_be,
+ [SDNPHasChain, SDNPMayStore]>;
def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>;
def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>;
@@ -93,9 +106,11 @@ def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
-def PPCfpextlh : SDNode<"PPCISD::FP_EXTEND_LH", SDT_PPCfpextlh, []>;
+def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>;
def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
string asmstr, InstrItinClass itin, Intrinsic Int,
@@ -855,14 +870,14 @@ let Uses = [RM] in {
let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
isReMaterializable = 1 in {
- def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins),
+ def XXLXORz : XX3Form_SameOp<60, 154, (outs vsrc:$XT), (ins),
"xxlxor $XT, $XT, $XT", IIC_VecGeneral,
[(set v4i32:$XT, (v4i32 immAllZerosV))]>;
- def XXLXORdpz : XX3Form_SetZero<60, 154,
+ def XXLXORdpz : XX3Form_SameOp<60, 154,
(outs vsfrc:$XT), (ins),
"xxlxor $XT, $XT, $XT", IIC_VecGeneral,
[(set f64:$XT, (fpimm0))]>;
- def XXLXORspz : XX3Form_SetZero<60, 154,
+ def XXLXORspz : XX3Form_SameOp<60, 154,
(outs vssrc:$XT), (ins),
"xxlxor $XT, $XT, $XT", IIC_VecGeneral,
[(set f32:$XT, (fpimm0))]>;
@@ -996,21 +1011,21 @@ def : Pat<(f64 (extractelt v2f64:$S, 1)),
(f64 (EXTRACT_SUBREG $S, sub_64))>;
}
-// Additional fnmsub patterns: -a*c + b == -(a*c - b)
-def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
- (XSNMSUBADP $B, $C, $A)>;
-def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
- (XSNMSUBADP $B, $C, $A)>;
+// Additional fnmsub patterns: -a*b + c == -(a*b - c)
+def : Pat<(fma (fneg f64:$A), f64:$B, f64:$C),
+ (XSNMSUBADP $C, $A, $B)>;
+def : Pat<(fma f64:$A, (fneg f64:$B), f64:$C),
+ (XSNMSUBADP $C, $A, $B)>;
-def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B),
- (XVNMSUBADP $B, $C, $A)>;
-def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B),
- (XVNMSUBADP $B, $C, $A)>;
+def : Pat<(fma (fneg v2f64:$A), v2f64:$B, v2f64:$C),
+ (XVNMSUBADP $C, $A, $B)>;
+def : Pat<(fma v2f64:$A, (fneg v2f64:$B), v2f64:$C),
+ (XVNMSUBADP $C, $A, $B)>;
-def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
- (XVNMSUBASP $B, $C, $A)>;
-def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
- (XVNMSUBASP $B, $C, $A)>;
+def : Pat<(fma (fneg v4f32:$A), v4f32:$B, v4f32:$C),
+ (XVNMSUBASP $C, $A, $B)>;
+def : Pat<(fma v4f32:$A, (fneg v4f32:$B), v4f32:$C),
+ (XVNMSUBASP $C, $A, $B)>;
def : Pat<(v2f64 (bitconvert v4f32:$A)),
(COPY_TO_REGCLASS $A, VSRC)>;
@@ -1077,7 +1092,8 @@ def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
(v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
-def : Pat<(v2f64 (PPCfpextlh v4f32:$C)), (XVCVSPDP (XXMRGHW $C, $C))>;
+def : Pat<(v2f64 (PPCfpexth v4f32:$C, 0)), (XVCVSPDP (XXMRGHW $C, $C))>;
+def : Pat<(v2f64 (PPCfpexth v4f32:$C, 1)), (XVCVSPDP (XXMRGLW $C, $C))>;
// Loads.
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
@@ -1088,6 +1104,19 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
(STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
}
+
+// Load vector big endian order
+let Predicates = [IsLittleEndian, HasVSX] in {
+ def : Pat<(v2f64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+ def : Pat<(PPCst_vec_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(v4f32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+ def : Pat<(PPCst_vec_be v4f32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
+ def : Pat<(v2i64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+ def : Pat<(PPCst_vec_be v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(v4i32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+ def : Pat<(PPCst_vec_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
+}
+
let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
@@ -1288,6 +1317,13 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
(XXLEQV $A, $B)>;
+ let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
+ isReMaterializable = 1 in {
+ def XXLEQVOnes : XX3Form_SameOp<60, 186, (outs vsrc:$XT), (ins),
+ "xxleqv $XT, $XT, $XT", IIC_VecGeneral,
+ [(set v4i32:$XT, (bitconvert (v16i8 immAllOnesV)))]>;
+ }
+
def XXLORC : XX3Form<60, 170,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xxlorc $XT, $XA, $XB", IIC_VecGeneral,
@@ -1476,6 +1512,12 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
AltVSXFMARel;
}
+ // Additional xsnmsubasp patterns: -a*b + c == -(a*b - c)
+ def : Pat<(fma (fneg f32:$A), f32:$B, f32:$C),
+ (XSNMSUBASP $C, $A, $B)>;
+ def : Pat<(fma f32:$A, (fneg f32:$B), f32:$C),
+ (XSNMSUBASP $C, $A, $B)>;
+
// Single Precision Conversions (FP <-> INT)
def XSCVSXDSP : XX2Form<60, 312,
(outs vssrc:$XT), (ins vsfrc:$XB),
@@ -1564,16 +1606,33 @@ let Predicates = [HasDirectMove] in {
def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT),
"mfvsrwz $rA, $XT", IIC_VecGeneral,
[(set i32:$rA, (PPCmfvsr f64:$XT))]>;
+ let isCodeGenOnly = 1 in
+ def MFVRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsrc:$XT),
+ "mfvsrwz $rA, $XT", IIC_VecGeneral,
+ []>;
def MTVSRD : XX1_RS6_RD5_XO<31, 179, (outs vsfrc:$XT), (ins g8rc:$rA),
"mtvsrd $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsra i64:$rA))]>,
Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in
+ def MTVRD : XX1_RS6_RD5_XO<31, 179, (outs vsrc:$XT), (ins g8rc:$rA),
+ "mtvsrd $XT, $rA", IIC_VecGeneral,
+ []>,
+ Requires<[In64BitMode]>;
def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$rA),
"mtvsrwa $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsra i32:$rA))]>;
+ let isCodeGenOnly = 1 in
+ def MTVRWA : XX1_RS6_RD5_XO<31, 211, (outs vsrc:$XT), (ins gprc:$rA),
+ "mtvsrwa $XT, $rA", IIC_VecGeneral,
+ []>;
def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA),
"mtvsrwz $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsrz i32:$rA))]>;
+ let isCodeGenOnly = 1 in
+ def MTVRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsrc:$XT), (ins gprc:$rA),
+ "mtvsrwz $XT, $rA", IIC_VecGeneral,
+ []>;
} // HasDirectMove
let Predicates = [IsISA3_0, HasDirectMove] in {
@@ -1597,6 +1656,22 @@ def : InstAlias<"mfvrd $rA, $XT",
(MFVRD g8rc:$rA, vrrc:$XT), 0>;
def : InstAlias<"mffprd $rA, $src",
(MFVSRD g8rc:$rA, f8rc:$src)>;
+def : InstAlias<"mtvrd $XT, $rA",
+ (MTVRD vrrc:$XT, g8rc:$rA), 0>;
+def : InstAlias<"mtfprd $dst, $rA",
+ (MTVSRD f8rc:$dst, g8rc:$rA)>;
+def : InstAlias<"mfvrwz $rA, $XT",
+ (MFVRWZ gprc:$rA, vrrc:$XT), 0>;
+def : InstAlias<"mffprwz $rA, $src",
+ (MFVSRWZ gprc:$rA, f8rc:$src)>;
+def : InstAlias<"mtvrwa $XT, $rA",
+ (MTVRWA vrrc:$XT, gprc:$rA), 0>;
+def : InstAlias<"mtfprwa $dst, $rA",
+ (MTVSRWA f8rc:$dst, gprc:$rA)>;
+def : InstAlias<"mtvrwz $XT, $rA",
+ (MTVRWZ vrrc:$XT, gprc:$rA), 0>;
+def : InstAlias<"mtfprwz $dst, $rA",
+ (MTVSRWZ f8rc:$dst, gprc:$rA)>;
/* Direct moves of various widths from GPR's into VSR's. Each move lines
the value up into element 0 (both BE and LE). Namely, entities smaller than
@@ -2581,9 +2656,9 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(fneg (int_ppc_fmaf128_round_to_odd
f128:$vA, f128:$vB, (fneg f128:$vTi))))]>;
- // Additional fnmsub patterns: -a*c + b == -(a*c - b)
- def : Pat<(fma (fneg f128:$A), f128:$C, f128:$B), (XSNMSUBQP $B, $C, $A)>;
- def : Pat<(fma f128:$A, (fneg f128:$C), f128:$B), (XSNMSUBQP $B, $C, $A)>;
+ // Additional fnmsub patterns: -a*b + c == -(a*b - c)
+ def : Pat<(fma (fneg f128:$A), f128:$B, f128:$C), (XSNMSUBQP $C, $A, $B)>;
+ def : Pat<(fma f128:$A, (fneg f128:$B), f128:$C), (XSNMSUBQP $C, $A, $B)>;
//===--------------------------------------------------------------------===//
// Quad/Double-Precision Compare Instructions:
@@ -2799,12 +2874,12 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
"xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP,
[(set v4i32: $XT,
- (int_ppc_vsx_xvtstdcsp v4f32:$XB, imm:$DCMX))]>;
+ (int_ppc_vsx_xvtstdcsp v4f32:$XB, timm:$DCMX))]>;
def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5,
(outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
"xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP,
[(set v2i64: $XT,
- (int_ppc_vsx_xvtstdcdp v2f64:$XB, imm:$DCMX))]>;
+ (int_ppc_vsx_xvtstdcdp v2f64:$XB, timm:$DCMX))]>;
//===--------------------------------------------------------------------===//
@@ -3024,6 +3099,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
+
+ def : Pat<(v8i16 (PPCld_vec_be xoaddr:$src)),
+ (COPY_TO_REGCLASS (LXVH8X xoaddr:$src), VRRC)>;
+ def : Pat<(PPCst_vec_be v8i16:$rS, xoaddr:$dst),
+ (STXVH8X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
+
+ def : Pat<(v16i8 (PPCld_vec_be xoaddr:$src)),
+ (COPY_TO_REGCLASS (LXVB16X xoaddr:$src), VRRC)>;
+ def : Pat<(PPCst_vec_be v16i8:$rS, xoaddr:$dst),
+ (STXVB16X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
} // IsLittleEndian, HasP9Vector
let Predicates = [IsBigEndian, HasP9Vector] in {
@@ -3059,7 +3144,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
- } // IsLittleEndian, HasP9Vector
+ } // IsBigEndian, HasP9Vector
// D-Form Load/Store
def : Pat<(v4i32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
@@ -3858,6 +3943,10 @@ let AddedComplexity = 400 in {
(XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
(v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+ def : Pat<(v2f64 (PPCldsplat xoaddr:$A)),
+ (v2f64 (LXVDSX xoaddr:$A))>;
+ def : Pat<(v2i64 (PPCldsplat xoaddr:$A)),
+ (v2i64 (LXVDSX xoaddr:$A))>;
// Build vectors of floating point converted to i64.
def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
@@ -4063,27 +4152,32 @@ let AddedComplexity = 400 in {
(XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
}
+ let Predicates = [HasP8Vector] in {
+ def : Pat<(v1i128 (bitconvert (v16i8 immAllOnesV))),
+ (v1i128 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
+ def : Pat<(v2i64 (bitconvert (v16i8 immAllOnesV))),
+ (v2i64 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
+ def : Pat<(v8i16 (bitconvert (v16i8 immAllOnesV))),
+ (v8i16 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
+ def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))),
+ (v16i8 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
+ }
+
let Predicates = [HasP9Vector] in {
// Endianness-neutral patterns for const splats with ISA 3.0 instructions.
def : Pat<(v4i32 (scalar_to_vector i32:$A)),
(v4i32 (MTVSRWS $A))>;
def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
(v4i32 (MTVSRWS $A))>;
- def : Pat<(v16i8 (build_vector immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
- immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
- immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
- immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
- immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
- immAnyExt8:$A)),
+ def : Pat<(v16i8 (build_vector immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A)),
(v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
- def : Pat<(v16i8 immAllOnesV),
- (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
- def : Pat<(v8i16 immAllOnesV),
- (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
- def : Pat<(v4i32 immAllOnesV),
- (v4i32 (XXSPLTIB 255))>;
- def : Pat<(v2i64 immAllOnesV),
- (v2i64 (XXSPLTIB 255))>;
def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
(v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>;
def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
@@ -4102,6 +4196,10 @@ let AddedComplexity = 400 in {
(v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
(DFLOADf32 iaddrX4:$A),
VSFRC)), 0))>;
+ def : Pat<(v4f32 (PPCldsplat xoaddr:$A)),
+ (v4f32 (LXVWSX xoaddr:$A))>;
+ def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
+ (v4i32 (LXVWSX xoaddr:$A))>;
}
let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
index 4d45d96d4479..d252cfbd26b1 100644
--- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -63,8 +63,24 @@ static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars",
cl::desc("Potential PHI threshold for PPC preinc loop prep"));
STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form");
+STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten");
namespace {
+ struct BucketElement {
+ BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {}
+ BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {}
+
+ const SCEVConstant *Offset;
+ Instruction *Instr;
+ };
+
+ struct Bucket {
+ Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B),
+ Elements(1, BucketElement(I)) {}
+
+ const SCEV *BaseSCEV;
+ SmallVector<BucketElement, 16> Elements;
+ };
class PPCLoopPreIncPrep : public FunctionPass {
public:
@@ -85,21 +101,47 @@ namespace {
AU.addRequired<ScalarEvolutionWrapperPass>();
}
- bool alreadyPrepared(Loop *L, Instruction* MemI,
- const SCEV *BasePtrStartSCEV,
- const SCEVConstant *BasePtrIncSCEV);
bool runOnFunction(Function &F) override;
- bool runOnLoop(Loop *L);
- void simplifyLoopLatch(Loop *L);
- bool rotateLoop(Loop *L);
-
private:
PPCTargetMachine *TM = nullptr;
+ const PPCSubtarget *ST;
DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
bool PreserveLCSSA;
+
+ bool runOnLoop(Loop *L);
+
+    /// Check if the required PHI node already exists in Loop \p L.
+ bool alreadyPrepared(Loop *L, Instruction* MemI,
+ const SCEV *BasePtrStartSCEV,
+ const SCEVConstant *BasePtrIncSCEV);
+
+    /// Collect candidates in Loop \p L for which \p isValidCandidate()
+    /// returns true.
+ SmallVector<Bucket, 16>
+ collectCandidates(Loop *L,
+ std::function<bool(const Instruction *, const Value *)>
+ isValidCandidate,
+ unsigned MaxCandidateNum);
+
+    /// Add a candidate to the candidate buckets \p Buckets.
+ void addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
+ SmallVector<Bucket, 16> &Buckets,
+ unsigned MaxCandidateNum);
+
+ /// Prepare all candidates in \p Buckets for update form.
+ bool updateFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets);
+
+    /// Prepare one chain \p BucketChain: find the best base element and
+    /// update all other elements in \p BucketChain accordingly.
+ bool prepareBaseForUpdateFormChain(Bucket &BucketChain);
+
+    /// Rewrite the load/store instructions in \p BucketChain according to the
+    /// prepared base.
+ bool rewriteLoadStores(Loop *L, Bucket &BucketChain,
+ SmallSet<BasicBlock *, 16> &BBChanged);
};
} // end anonymous namespace
@@ -111,30 +153,15 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false)
+static const std::string PHINodeNameSuffix = ".phi";
+static const std::string CastNodeNameSuffix = ".cast";
+static const std::string GEPNodeIncNameSuffix = ".inc";
+static const std::string GEPNodeOffNameSuffix = ".off";
+
FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) {
return new PPCLoopPreIncPrep(TM);
}
-namespace {
-
- struct BucketElement {
- BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {}
- BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {}
-
- const SCEVConstant *Offset;
- Instruction *Instr;
- };
-
- struct Bucket {
- Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B),
- Elements(1, BucketElement(I)) {}
-
- const SCEV *BaseSCEV;
- SmallVector<BucketElement, 16> Elements;
- };
-
-} // end anonymous namespace
-
static bool IsPtrInBounds(Value *BasePtr) {
Value *StrippedBasePtr = BasePtr;
while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr))
@@ -145,6 +172,14 @@ static bool IsPtrInBounds(Value *BasePtr) {
return false;
}
+static std::string getInstrName(const Value *I, const std::string Suffix) {
+  assert(I && "Invalid parameter!");
+ if (I->hasName())
+ return (I->getName() + Suffix).str();
+ else
+ return "";
+}
+
static Value *GetPointerOperand(Value *MemI) {
if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) {
return LMemI->getPointerOperand();
@@ -167,6 +202,7 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) {
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+ ST = TM ? TM->getSubtargetImpl(F) : nullptr;
bool MadeChange = false;
@@ -177,10 +213,280 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) {
return MadeChange;
}
+void PPCLoopPreIncPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
+ SmallVector<Bucket, 16> &Buckets,
+ unsigned MaxCandidateNum) {
+ assert((MemI && GetPointerOperand(MemI)) &&
+ "Candidate should be a memory instruction.");
+ assert(LSCEV && "Invalid SCEV for Ptr value.");
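+  // Place MemI into an existing bucket if its address differs from that
+  // bucket's base SCEV by a compile-time constant; otherwise start a new
+  // bucket, up to MaxCandidateNum buckets.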
+ bool FoundBucket = false;
+ for (auto &B : Buckets) {
+ const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
+ if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) {
+ B.Elements.push_back(BucketElement(CDiff, MemI));
+ FoundBucket = true;
+ break;
+ }
+ }
+
+ if (!FoundBucket) {
+ if (Buckets.size() == MaxCandidateNum)
+ return;
+ Buckets.push_back(Bucket(LSCEV, MemI));
+ }
+}
+
+SmallVector<Bucket, 16> PPCLoopPreIncPrep::collectCandidates(
+ Loop *L,
+ std::function<bool(const Instruction *, const Value *)> isValidCandidate,
+ unsigned MaxCandidateNum) {
+ SmallVector<Bucket, 16> Buckets;
+ for (const auto &BB : L->blocks())
+ for (auto &J : *BB) {
+ Value *PtrValue;
+ Instruction *MemI;
+
+ if (LoadInst *LMemI = dyn_cast<LoadInst>(&J)) {
+ MemI = LMemI;
+ PtrValue = LMemI->getPointerOperand();
+ } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&J)) {
+ MemI = SMemI;
+ PtrValue = SMemI->getPointerOperand();
+ } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) {
+ if (IMemI->getIntrinsicID() == Intrinsic::prefetch) {
+ MemI = IMemI;
+ PtrValue = IMemI->getArgOperand(0);
+ } else continue;
+ } else continue;
+
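+      // Only pointers in the default address space whose value is an
+      // add-recurrence of this loop are candidates.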
+ unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+ if (PtrAddrSpace)
+ continue;
+
+ if (L->isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L);
+ const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LARSCEV || LARSCEV->getLoop() != L)
+ continue;
+
+ if (isValidCandidate(&J, PtrValue))
+ addOneCandidate(MemI, LSCEV, Buckets, MaxCandidateNum);
+ }
+ return Buckets;
+}
+
+// TODO: implement a smarter base-choosing policy.
+// Currently we always choose an existing load/store offset. This may lead to
+// suboptimal code sequences. For example, for one DS chain with offsets
+// {-32769, 2003, 2007, 2011}, we choose -32769 as the base offset, leaving
+// displacements {0, 34772, 34776, 34780} for the loads/stores. Although each
+// displacement is now a multiple of 4, it cannot be represented as a signed
+// 16-bit value.
+bool PPCLoopPreIncPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) {
+ // We have a choice now of which instruction's memory operand we use as the
+ // base for the generated PHI. Always picking the first instruction in each
+ // bucket does not work well, specifically because that instruction might
+ // be a prefetch (and there are no pre-increment dcbt variants). Otherwise,
+ // the choice is somewhat arbitrary, because the backend will happily
+ // generate direct offsets from both the pre-incremented and
+ // post-incremented pointer values. Thus, we'll pick the first non-prefetch
+ // instruction in each bucket, and adjust the recurrence and other offsets
+ // accordingly.
+ for (int j = 0, je = BucketChain.Elements.size(); j != je; ++j) {
+ if (auto *II = dyn_cast<IntrinsicInst>(BucketChain.Elements[j].Instr))
+ if (II->getIntrinsicID() == Intrinsic::prefetch)
+ continue;
+
+ // If we'd otherwise pick the first element anyway, there's nothing to do.
+ if (j == 0)
+ break;
+
+ // If our chosen element has no offset from the base pointer, there's
+ // nothing to do.
+ if (!BucketChain.Elements[j].Offset ||
+ BucketChain.Elements[j].Offset->isZero())
+ break;
+
+ const SCEV *Offset = BucketChain.Elements[j].Offset;
+ BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset);
+ for (auto &E : BucketChain.Elements) {
+ if (E.Offset)
+ E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
+ else
+ E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
+ }
+
+ std::swap(BucketChain.Elements[j], BucketChain.Elements[0]);
+ break;
+ }
+ return true;
+}
+
+bool PPCLoopPreIncPrep::rewriteLoadStores(
+ Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged) {
+ bool MadeChange = false;
+ const SCEVAddRecExpr *BasePtrSCEV =
+ cast<SCEVAddRecExpr>(BucketChain.BaseSCEV);
+ if (!BasePtrSCEV->isAffine())
+ return MadeChange;
+
+ LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
+
+ assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?");
+
+ // The instruction corresponding to the Bucket's BaseSCEV must be the first
+ // in the vector of elements.
+ Instruction *MemI = BucketChain.Elements.begin()->Instr;
+ Value *BasePtr = GetPointerOperand(MemI);
+ assert(BasePtr && "No pointer operand");
+
+ Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext());
+ Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(),
+ BasePtr->getType()->getPointerAddressSpace());
+
+ const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart();
+ if (!SE->isLoopInvariant(BasePtrStartSCEV, L))
+ return MadeChange;
+
+ const SCEVConstant *BasePtrIncSCEV =
+ dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE));
+ if (!BasePtrIncSCEV)
+ return MadeChange;
+ BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV);
+ if (!isSafeToExpand(BasePtrStartSCEV, *SE))
+ return MadeChange;
+
+ if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV))
+ return MadeChange;
+
+ LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
+
+ BasicBlock *Header = L->getHeader();
+ unsigned HeaderLoopPredCount = pred_size(Header);
+ BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+
+ PHINode *NewPHI =
+ PHINode::Create(I8PtrTy, HeaderLoopPredCount,
+ getInstrName(MemI, PHINodeNameSuffix),
+ Header->getFirstNonPHI());
+
+ SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart");
+ Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
+ LoopPredecessor->getTerminator());
+
+ // Note that LoopPredecessor might occur in the predecessor list multiple
+ // times, and we need to add it the right number of times.
+ for (const auto &PI : predecessors(Header)) {
+ if (PI != LoopPredecessor)
+ continue;
+
+ NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
+ }
+
+ Instruction *InsPoint = &*Header->getFirstInsertionPt();
+ GetElementPtrInst *PtrInc = GetElementPtrInst::Create(
+ I8Ty, NewPHI, BasePtrIncSCEV->getValue(),
+ getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint);
+ PtrInc->setIsInBounds(IsPtrInBounds(BasePtr));
+ for (const auto &PI : predecessors(Header)) {
+ if (PI == LoopPredecessor)
+ continue;
+
+ NewPHI->addIncoming(PtrInc, PI);
+ }
+
+ Instruction *NewBasePtr;
+ if (PtrInc->getType() != BasePtr->getType())
+ NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(),
+ getInstrName(PtrInc, CastNodeNameSuffix), InsPoint);
+ else
+ NewBasePtr = PtrInc;
+
+ if (Instruction *IDel = dyn_cast<Instruction>(BasePtr))
+ BBChanged.insert(IDel->getParent());
+ BasePtr->replaceAllUsesWith(NewBasePtr);
+ RecursivelyDeleteTriviallyDeadInstructions(BasePtr);
+
+ // Keep track of the replacement pointer values we've inserted so that we
+ // don't generate more pointer values than necessary.
+ SmallPtrSet<Value *, 16> NewPtrs;
+ NewPtrs.insert(NewBasePtr);
+
+ for (auto I = std::next(BucketChain.Elements.begin()),
+ IE = BucketChain.Elements.end(); I != IE; ++I) {
+ Value *Ptr = GetPointerOperand(I->Instr);
+ assert(Ptr && "No pointer operand");
+ if (NewPtrs.count(Ptr))
+ continue;
+
+ Instruction *RealNewPtr;
+ if (!I->Offset || I->Offset->getValue()->isZero()) {
+ RealNewPtr = NewBasePtr;
+ } else {
+ Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
+ if (PtrIP && isa<Instruction>(NewBasePtr) &&
+ cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
+ PtrIP = nullptr;
+ else if (PtrIP && isa<PHINode>(PtrIP))
+ PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
+ else if (!PtrIP)
+ PtrIP = I->Instr;
+
+ GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
+ I8Ty, PtrInc, I->Offset->getValue(),
+ getInstrName(I->Instr, GEPNodeOffNameSuffix), PtrIP);
+ if (!PtrIP)
+ NewPtr->insertAfter(cast<Instruction>(PtrInc));
+ NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
+ RealNewPtr = NewPtr;
+ }
+
+ if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
+ BBChanged.insert(IDel->getParent());
+
+ Instruction *ReplNewPtr;
+ if (Ptr->getType() != RealNewPtr->getType()) {
+ ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
+ getInstrName(Ptr, CastNodeNameSuffix));
+ ReplNewPtr->insertAfter(RealNewPtr);
+ } else
+ ReplNewPtr = RealNewPtr;
+
+ Ptr->replaceAllUsesWith(ReplNewPtr);
+ RecursivelyDeleteTriviallyDeadInstructions(Ptr);
+
+ NewPtrs.insert(RealNewPtr);
+ }
+
+ MadeChange = true;
+ UpdFormChainRewritten++;
+
+ return MadeChange;
+}
+
+bool PPCLoopPreIncPrep::updateFormPrep(Loop *L,
+ SmallVector<Bucket, 16> &Buckets) {
+ bool MadeChange = false;
+ if (Buckets.empty())
+ return MadeChange;
+ SmallSet<BasicBlock *, 16> BBChanged;
+ for (auto &Bucket : Buckets)
+    // The base address of each bucket is transformed into a phi and the
+    // others are rewritten based on the new base.
+ if (prepareBaseForUpdateFormChain(Bucket))
+ MadeChange |= rewriteLoadStores(L, Bucket, BBChanged);
+ if (MadeChange)
+ for (auto &BB : L->blocks())
+ if (BBChanged.count(BB))
+ DeleteDeadPHIs(BB);
+ return MadeChange;
+}
+
// In order to prepare for the pre-increment a PHI is added.
// This function will check to see if that PHI already exists and will return
-// true if it found an existing PHI with the same start and increment as the
-// one we wanted to create.
+// true if it found an existing PHI with the same start and increment as the
+// one we wanted to create.
bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI,
const SCEV *BasePtrStartSCEV,
const SCEVConstant *BasePtrIncSCEV) {
@@ -216,10 +522,10 @@ bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI,
continue;
if (CurrentPHINode->getNumIncomingValues() == 2) {
- if ( (CurrentPHINode->getIncomingBlock(0) == LatchBB &&
- CurrentPHINode->getIncomingBlock(1) == PredBB) ||
- (CurrentPHINode->getIncomingBlock(1) == LatchBB &&
- CurrentPHINode->getIncomingBlock(0) == PredBB) ) {
+ if ((CurrentPHINode->getIncomingBlock(0) == LatchBB &&
+ CurrentPHINode->getIncomingBlock(1) == PredBB) ||
+ (CurrentPHINode->getIncomingBlock(1) == LatchBB &&
+ CurrentPHINode->getIncomingBlock(0) == PredBB)) {
if (PHIBasePtrSCEV->getStart() == BasePtrStartSCEV &&
PHIBasePtrIncSCEV == BasePtrIncSCEV) {
// The existing PHI (CurrentPHINode) has the same start and increment
@@ -242,89 +548,6 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
- BasicBlock *Header = L->getHeader();
-
- const PPCSubtarget *ST =
- TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr;
-
- unsigned HeaderLoopPredCount = pred_size(Header);
-
- // Collect buckets of comparable addresses used by loads and stores.
- SmallVector<Bucket, 16> Buckets;
- for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
- I != IE; ++I) {
- for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
- J != JE; ++J) {
- Value *PtrValue;
- Instruction *MemI;
-
- if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) {
- MemI = LMemI;
- PtrValue = LMemI->getPointerOperand();
- } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) {
- MemI = SMemI;
- PtrValue = SMemI->getPointerOperand();
- } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(J)) {
- if (IMemI->getIntrinsicID() == Intrinsic::prefetch) {
- MemI = IMemI;
- PtrValue = IMemI->getArgOperand(0);
- } else continue;
- } else continue;
-
- unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
- if (PtrAddrSpace)
- continue;
-
- // There are no update forms for Altivec vector load/stores.
- if (ST && ST->hasAltivec() &&
- PtrValue->getType()->getPointerElementType()->isVectorTy())
- continue;
-
- if (L->isLoopInvariant(PtrValue))
- continue;
-
- const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L);
- if (const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV)) {
- if (LARSCEV->getLoop() != L)
- continue;
- // See getPreIndexedAddressParts, the displacement for LDU/STDU has to
- // be 4's multiple (DS-form). For i64 loads/stores when the displacement
- // fits in a 16-bit signed field but isn't a multiple of 4, it will be
- // useless and possible to break some original well-form addressing mode
- // to make this pre-inc prep for it.
- if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) {
- if (const SCEVConstant *StepConst =
- dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) {
- const APInt &ConstInt = StepConst->getValue()->getValue();
- if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0)
- continue;
- }
- }
- } else {
- continue;
- }
-
- bool FoundBucket = false;
- for (auto &B : Buckets) {
- const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
- if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) {
- B.Elements.push_back(BucketElement(CDiff, MemI));
- FoundBucket = true;
- break;
- }
- }
-
- if (!FoundBucket) {
- if (Buckets.size() == MaxVars)
- return MadeChange;
- Buckets.push_back(Bucket(LSCEV, MemI));
- }
- }
- }
-
- if (Buckets.empty())
- return MadeChange;
-
BasicBlock *LoopPredecessor = L->getLoopPredecessor();
// If there is no loop predecessor, or the loop predecessor's terminator
// returns a value (which might contribute to determining the loop's
@@ -335,191 +558,48 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
if (LoopPredecessor)
MadeChange = true;
}
- if (!LoopPredecessor)
+ if (!LoopPredecessor) {
+ LLVM_DEBUG(dbgs() << "PIP fails since no predecessor for current loop.\n");
return MadeChange;
+ }
- LLVM_DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n");
-
- SmallSet<BasicBlock *, 16> BBChanged;
- for (unsigned i = 0, e = Buckets.size(); i != e; ++i) {
- // The base address of each bucket is transformed into a phi and the others
- // are rewritten as offsets of that variable.
-
- // We have a choice now of which instruction's memory operand we use as the
- // base for the generated PHI. Always picking the first instruction in each
- // bucket does not work well, specifically because that instruction might
- // be a prefetch (and there are no pre-increment dcbt variants). Otherwise,
- // the choice is somewhat arbitrary, because the backend will happily
- // generate direct offsets from both the pre-incremented and
- // post-incremented pointer values. Thus, we'll pick the first non-prefetch
- // instruction in each bucket, and adjust the recurrence and other offsets
- // accordingly.
- for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) {
- if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr))
- if (II->getIntrinsicID() == Intrinsic::prefetch)
- continue;
-
- // If we'd otherwise pick the first element anyway, there's nothing to do.
- if (j == 0)
- break;
-
- // If our chosen element has no offset from the base pointer, there's
- // nothing to do.
- if (!Buckets[i].Elements[j].Offset ||
- Buckets[i].Elements[j].Offset->isZero())
- break;
-
- const SCEV *Offset = Buckets[i].Elements[j].Offset;
- Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset);
- for (auto &E : Buckets[i].Elements) {
- if (E.Offset)
- E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
- else
- E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
- }
-
- std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]);
- break;
- }
-
- const SCEVAddRecExpr *BasePtrSCEV =
- cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV);
- if (!BasePtrSCEV->isAffine())
- continue;
-
- LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
- assert(BasePtrSCEV->getLoop() == L &&
- "AddRec for the wrong loop?");
-
- // The instruction corresponding to the Bucket's BaseSCEV must be the first
- // in the vector of elements.
- Instruction *MemI = Buckets[i].Elements.begin()->Instr;
- Value *BasePtr = GetPointerOperand(MemI);
- assert(BasePtr && "No pointer operand");
-
- Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext());
- Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(),
- BasePtr->getType()->getPointerAddressSpace());
-
- const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart();
- if (!SE->isLoopInvariant(BasePtrStartSCEV, L))
- continue;
-
- const SCEVConstant *BasePtrIncSCEV =
- dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE));
- if (!BasePtrIncSCEV)
- continue;
- BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV);
- if (!isSafeToExpand(BasePtrStartSCEV, *SE))
- continue;
-
- LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
-
- if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV))
- continue;
-
- PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount,
- MemI->hasName() ? MemI->getName() + ".phi" : "",
- Header->getFirstNonPHI());
-
- SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart");
- Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
- LoopPredecessor->getTerminator());
-
- // Note that LoopPredecessor might occur in the predecessor list multiple
- // times, and we need to add it the right number of times.
- for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
- PI != PE; ++PI) {
- if (*PI != LoopPredecessor)
- continue;
-
- NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
- }
-
- Instruction *InsPoint = &*Header->getFirstInsertionPt();
- GetElementPtrInst *PtrInc = GetElementPtrInst::Create(
- I8Ty, NewPHI, BasePtrIncSCEV->getValue(),
- MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint);
- PtrInc->setIsInBounds(IsPtrInBounds(BasePtr));
- for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
- PI != PE; ++PI) {
- if (*PI == LoopPredecessor)
- continue;
-
- NewPHI->addIncoming(PtrInc, *PI);
- }
-
- Instruction *NewBasePtr;
- if (PtrInc->getType() != BasePtr->getType())
- NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(),
- PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint);
- else
- NewBasePtr = PtrInc;
-
- if (Instruction *IDel = dyn_cast<Instruction>(BasePtr))
- BBChanged.insert(IDel->getParent());
- BasePtr->replaceAllUsesWith(NewBasePtr);
- RecursivelyDeleteTriviallyDeadInstructions(BasePtr);
-
- // Keep track of the replacement pointer values we've inserted so that we
- // don't generate more pointer values than necessary.
- SmallPtrSet<Value *, 16> NewPtrs;
- NewPtrs.insert( NewBasePtr);
-
- for (auto I = std::next(Buckets[i].Elements.begin()),
- IE = Buckets[i].Elements.end(); I != IE; ++I) {
- Value *Ptr = GetPointerOperand(I->Instr);
- assert(Ptr && "No pointer operand");
- if (NewPtrs.count(Ptr))
- continue;
-
- Instruction *RealNewPtr;
- if (!I->Offset || I->Offset->getValue()->isZero()) {
- RealNewPtr = NewBasePtr;
- } else {
- Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
- if (PtrIP && isa<Instruction>(NewBasePtr) &&
- cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
- PtrIP = nullptr;
- else if (isa<PHINode>(PtrIP))
- PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
- else if (!PtrIP)
- PtrIP = I->Instr;
-
- GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
- I8Ty, PtrInc, I->Offset->getValue(),
- I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP);
- if (!PtrIP)
- NewPtr->insertAfter(cast<Instruction>(PtrInc));
- NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
- RealNewPtr = NewPtr;
+  // Check whether a load/store is a candidate for the update form. This lambda
+  // is passed to collectCandidates, which collects candidates of the kinds the
+  // lambda accepts.
+ auto isUpdateFormCandidate = [&] (const Instruction *I,
+ const Value *PtrValue) {
+ assert((PtrValue && I) && "Invalid parameter!");
+ // There are no update forms for Altivec vector load/stores.
+ if (ST && ST->hasAltivec() &&
+ PtrValue->getType()->getPointerElementType()->isVectorTy())
+ return false;
+    // See getPreIndexedAddressParts: the displacement for LDU/STDU has to be
+    // a multiple of 4 (DS-form). For i64 loads/stores, when the displacement
+    // fits in a 16-bit signed field but isn't a multiple of 4, this pre-inc
+    // prep is useless and may even break an originally well-formed addressing
+    // mode.
+ if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) {
+ const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L);
+ const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LARSCEV || LARSCEV->getLoop() != L)
+ return false;
+ if (const SCEVConstant *StepConst =
+ dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) {
+ const APInt &ConstInt = StepConst->getValue()->getValue();
+ if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0)
+ return false;
}
-
- if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
- BBChanged.insert(IDel->getParent());
-
- Instruction *ReplNewPtr;
- if (Ptr->getType() != RealNewPtr->getType()) {
- ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
- Ptr->hasName() ? Ptr->getName() + ".cast" : "");
- ReplNewPtr->insertAfter(RealNewPtr);
- } else
- ReplNewPtr = RealNewPtr;
-
- Ptr->replaceAllUsesWith(ReplNewPtr);
- RecursivelyDeleteTriviallyDeadInstructions(Ptr);
-
- NewPtrs.insert(RealNewPtr);
}
+ return true;
+ };
- MadeChange = true;
- }
+  // Collect buckets of comparable addresses used by loads, stores and prefetch
+  // intrinsics, for the update form.
+ SmallVector<Bucket, 16> UpdateFormBuckets =
+ collectCandidates(L, isUpdateFormCandidate, MaxVars);
- for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
- I != IE; ++I) {
- if (BBChanged.count(*I))
- DeleteDeadPHIs(*I);
- }
+ // Prepare for update form.
+ if (!UpdateFormBuckets.empty())
+ MadeChange |= updateFormPrep(L, UpdateFormBuckets);
return MadeChange;
}
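
The DS-form restriction that isUpdateFormCandidate applies can be restated in isolation. The sketch below is illustrative only (not taken from the patch) and mirrors the isSignedIntN(16)/srem(4) test on a plain 64-bit stride.

#include <cstdint>

// A candidate's constant stride is rejected when it fits in a signed 16-bit
// displacement field but is not a multiple of 4, mirroring the check made
// for i64 loads/stores in isUpdateFormCandidate.
static bool strideAcceptableForDSForm(std::int64_t Stride) {
  bool FitsSInt16 = Stride >= INT16_MIN && Stride <= INT16_MAX;
  return !(FitsSInt16 && Stride % 4 != 0);
}
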
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index 027e6bd1ba06..b6496f189a3a 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -79,7 +79,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO,
}
static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
- AsmPrinter &Printer, bool isDarwin) {
+ AsmPrinter &Printer, bool IsDarwin) {
MCContext &Ctx = Printer.OutContext;
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
@@ -137,10 +137,10 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
// Add ha16() / lo16() markers if required.
switch (access) {
case PPCII::MO_LO:
- Expr = PPCMCExpr::createLo(Expr, isDarwin, Ctx);
+ Expr = PPCMCExpr::createLo(Expr, IsDarwin, Ctx);
break;
case PPCII::MO_HA:
- Expr = PPCMCExpr::createHa(Expr, isDarwin, Ctx);
+ Expr = PPCMCExpr::createHa(Expr, IsDarwin, Ctx);
break;
}
@@ -148,20 +148,20 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
}
void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
- AsmPrinter &AP, bool isDarwin) {
+ AsmPrinter &AP, bool IsDarwin) {
OutMI.setOpcode(MI->getOpcode());
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MCOperand MCOp;
if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP,
- isDarwin))
+ IsDarwin))
OutMI.addOperand(MCOp);
}
}
bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
MCOperand &OutMO, AsmPrinter &AP,
- bool isDarwin) {
+ bool IsDarwin) {
switch (MO.getType()) {
default:
llvm_unreachable("unknown operand type");
@@ -181,17 +181,20 @@ bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
return true;
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ExternalSymbol:
- OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin);
+ OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, IsDarwin);
return true;
case MachineOperand::MO_JumpTableIndex:
- OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin);
+ OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, IsDarwin);
return true;
case MachineOperand::MO_ConstantPoolIndex:
- OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin);
+ OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, IsDarwin);
return true;
case MachineOperand::MO_BlockAddress:
OutMO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP,
- isDarwin);
+ IsDarwin);
+ return true;
+ case MachineOperand::MO_MCSymbol:
+ OutMO = GetSymbolRef(MO, MO.getMCSymbol(), AP, IsDarwin);
return true;
case MachineOperand::MO_RegisterMask:
return false;
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
index 446246358e96..ac8ac060f460 100644
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -148,8 +148,8 @@ static MachineInstr *getVRegDefOrNull(MachineOperand *Op,
if (!Op->isReg())
return nullptr;
- unsigned Reg = Op->getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = Op->getReg();
+ if (!Register::isVirtualRegister(Reg))
return nullptr;
return MRI->getVRegDef(Reg);
@@ -344,8 +344,7 @@ bool PPCMIPeephole::simplifyCode(void) {
unsigned TrueReg2 =
TRI->lookThruCopyLike(MI.getOperand(2).getReg(), MRI);
- if (TrueReg1 == TrueReg2
- && TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
+ if (TrueReg1 == TrueReg2 && Register::isVirtualRegister(TrueReg1)) {
MachineInstr *DefMI = MRI->getVRegDef(TrueReg1);
unsigned DefOpc = DefMI ? DefMI->getOpcode() : 0;
@@ -358,7 +357,7 @@ bool PPCMIPeephole::simplifyCode(void) {
return false;
unsigned DefReg =
TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
- if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
+ if (Register::isVirtualRegister(DefReg)) {
MachineInstr *LoadMI = MRI->getVRegDef(DefReg);
if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX)
return true;
@@ -444,7 +443,7 @@ bool PPCMIPeephole::simplifyCode(void) {
unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
unsigned TrueReg =
TRI->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI);
- if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
+ if (!Register::isVirtualRegister(TrueReg))
break;
MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
if (!DefMI)
@@ -453,8 +452,8 @@ bool PPCMIPeephole::simplifyCode(void) {
auto isConvertOfSplat = [=]() -> bool {
if (DefOpcode != PPC::XVCVSPSXWS && DefOpcode != PPC::XVCVSPUXWS)
return false;
- unsigned ConvReg = DefMI->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(ConvReg))
+ Register ConvReg = DefMI->getOperand(1).getReg();
+ if (!Register::isVirtualRegister(ConvReg))
return false;
MachineInstr *Splt = MRI->getVRegDef(ConvReg);
return Splt && (Splt->getOpcode() == PPC::LXVWSX ||
@@ -481,9 +480,9 @@ bool PPCMIPeephole::simplifyCode(void) {
// Splat fed by a shift. Usually when we align value to splat into
// vector element zero.
if (DefOpcode == PPC::XXSLDWI) {
- unsigned ShiftRes = DefMI->getOperand(0).getReg();
- unsigned ShiftOp1 = DefMI->getOperand(1).getReg();
- unsigned ShiftOp2 = DefMI->getOperand(2).getReg();
+ Register ShiftRes = DefMI->getOperand(0).getReg();
+ Register ShiftOp1 = DefMI->getOperand(1).getReg();
+ Register ShiftOp2 = DefMI->getOperand(2).getReg();
unsigned ShiftImm = DefMI->getOperand(3).getImm();
unsigned SplatImm = MI.getOperand(2).getImm();
if (ShiftOp1 == ShiftOp2) {
@@ -507,7 +506,7 @@ bool PPCMIPeephole::simplifyCode(void) {
// If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant.
unsigned TrueReg =
TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
- if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
+ if (!Register::isVirtualRegister(TrueReg))
break;
MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
@@ -518,8 +517,8 @@ bool PPCMIPeephole::simplifyCode(void) {
TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
unsigned DefsReg2 =
TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
- if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) ||
- !TargetRegisterInfo::isVirtualRegister(DefsReg2))
+ if (!Register::isVirtualRegister(DefsReg1) ||
+ !Register::isVirtualRegister(DefsReg2))
break;
MachineInstr *P1 = MRI->getVRegDef(DefsReg1);
MachineInstr *P2 = MRI->getVRegDef(DefsReg2);
@@ -533,8 +532,8 @@ bool PPCMIPeephole::simplifyCode(void) {
if (RoundInstr->getOpcode() == PPC::FRSP &&
MRI->hasOneNonDBGUse(RoundInstr->getOperand(0).getReg())) {
Simplified = true;
- unsigned ConvReg1 = RoundInstr->getOperand(1).getReg();
- unsigned FRSPDefines = RoundInstr->getOperand(0).getReg();
+ Register ConvReg1 = RoundInstr->getOperand(1).getReg();
+ Register FRSPDefines = RoundInstr->getOperand(0).getReg();
MachineInstr &Use = *(MRI->use_instr_begin(FRSPDefines));
for (int i = 0, e = Use.getNumOperands(); i < e; ++i)
if (Use.getOperand(i).isReg() &&
@@ -566,8 +565,8 @@ bool PPCMIPeephole::simplifyCode(void) {
case PPC::EXTSH8:
case PPC::EXTSH8_32_64: {
if (!EnableSExtElimination) break;
- unsigned NarrowReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(NarrowReg))
+ Register NarrowReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(NarrowReg))
break;
MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg);
@@ -610,8 +609,8 @@ bool PPCMIPeephole::simplifyCode(void) {
case PPC::EXTSW_32:
case PPC::EXTSW_32_64: {
if (!EnableSExtElimination) break;
- unsigned NarrowReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(NarrowReg))
+ Register NarrowReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(NarrowReg))
break;
MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg);
@@ -652,8 +651,8 @@ bool PPCMIPeephole::simplifyCode(void) {
// We can eliminate EXTSW if the input is known to be already
// sign-extended.
LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n");
- unsigned TmpReg =
- MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass);
+ Register TmpReg =
+ MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::IMPLICIT_DEF),
TmpReg);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::INSERT_SUBREG),
@@ -679,8 +678,8 @@ bool PPCMIPeephole::simplifyCode(void) {
if (MI.getOperand(2).getImm() != 0)
break;
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(SrcReg))
break;
MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
@@ -695,8 +694,8 @@ bool PPCMIPeephole::simplifyCode(void) {
SrcMI = SubRegMI;
if (SubRegMI->getOpcode() == PPC::COPY) {
- unsigned CopyReg = SubRegMI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(CopyReg))
+ Register CopyReg = SubRegMI->getOperand(1).getReg();
+ if (Register::isVirtualRegister(CopyReg))
SrcMI = MRI->getVRegDef(CopyReg);
}
@@ -757,7 +756,7 @@ bool PPCMIPeephole::simplifyCode(void) {
break; // We don't have an ADD fed by LI's that can be transformed
// Now we know that Op1 is the PHI node and Op2 is the dominator
- unsigned DominatorReg = Op2.getReg();
+ Register DominatorReg = Op2.getReg();
const TargetRegisterClass *TRC = MI.getOpcode() == PPC::ADD8
? &PPC::G8RC_and_G8RC_NOX0RegClass
@@ -927,7 +926,7 @@ static unsigned getSrcVReg(unsigned Reg, MachineBasicBlock *BB1,
}
else if (Inst->isFullCopy())
NextReg = Inst->getOperand(1).getReg();
- if (NextReg == SrcReg || !TargetRegisterInfo::isVirtualRegister(NextReg))
+ if (NextReg == SrcReg || !Register::isVirtualRegister(NextReg))
break;
SrcReg = NextReg;
}
@@ -949,9 +948,8 @@ static bool eligibleForCompareElimination(MachineBasicBlock &MBB,
(*BII).getOpcode() == PPC::BCC &&
(*BII).getOperand(1).isReg()) {
// We optimize only if the condition code is used only by one BCC.
- unsigned CndReg = (*BII).getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(CndReg) ||
- !MRI->hasOneNonDBGUse(CndReg))
+ Register CndReg = (*BII).getOperand(1).getReg();
+ if (!Register::isVirtualRegister(CndReg) || !MRI->hasOneNonDBGUse(CndReg))
return false;
MachineInstr *CMPI = MRI->getVRegDef(CndReg);
@@ -961,7 +959,7 @@ static bool eligibleForCompareElimination(MachineBasicBlock &MBB,
// We skip this BB if a physical register is used in comparison.
for (MachineOperand &MO : CMPI->operands())
- if (MO.isReg() && !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (MO.isReg() && !Register::isVirtualRegister(MO.getReg()))
return false;
return true;
@@ -1271,8 +1269,8 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {
// We touch up the compare instruction in MBB2 and move it to
// a previous BB to handle partially redundant case.
if (SwapOperands) {
- unsigned Op1 = CMPI2->getOperand(1).getReg();
- unsigned Op2 = CMPI2->getOperand(2).getReg();
+ Register Op1 = CMPI2->getOperand(1).getReg();
+ Register Op2 = CMPI2->getOperand(2).getReg();
CMPI2->getOperand(1).setReg(Op2);
CMPI2->getOperand(2).setReg(Op1);
}
@@ -1295,7 +1293,7 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {
MBBtoMoveCmp->splice(I, &MBB2, MachineBasicBlock::iterator(CMPI2));
DebugLoc DL = CMPI2->getDebugLoc();
- unsigned NewVReg = MRI->createVirtualRegister(&PPC::CRRCRegClass);
+ Register NewVReg = MRI->createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(MBB2, MBB2.begin(), DL,
TII->get(PPC::PHI), NewVReg)
.addReg(BI1->getOperand(1).getReg()).addMBB(MBB1)
@@ -1334,8 +1332,8 @@ bool PPCMIPeephole::emitRLDICWhenLoweringJumpTables(MachineInstr &MI) {
if (MI.getOpcode() != PPC::RLDICR)
return false;
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(SrcReg))
return false;
MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
@@ -1414,8 +1412,8 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI,
if (SHMI + MEMI != 63)
return false;
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(SrcReg))
return false;
MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
@@ -1428,6 +1426,12 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI,
if (!MRI->hasOneNonDBGUse(SrcReg))
return false;
+ assert(SrcMI->getNumOperands() == 2 && "EXTSW should have 2 operands");
+ assert(SrcMI->getOperand(1).isReg() &&
+ "EXTSW's second operand should be a register");
+ if (!Register::isVirtualRegister(SrcMI->getOperand(1).getReg()))
+ return false;
+
LLVM_DEBUG(dbgs() << "Combining pair: ");
LLVM_DEBUG(SrcMI->dump());
LLVM_DEBUG(MI.dump());
diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index d83c92276800..b1c0433641dd 100644
--- a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -57,6 +57,109 @@ namespace {
MachineFunctionProperties::Property::NoVRegs);
}
+  // This function removes any redundant load immediates. It has two nested
+  // loops: the outer loop finds a load immediate BBI that could be used to
+  // replace subsequent redundant ones; the inner loop scans the instructions
+  // after BBI to find the redundancies and updates kill/dead flags
+  // accordingly. If an AfterBBI is the same load immediate as BBI, it is
+  // redundant; otherwise, any instruction that modifies the def register of
+  // BBI ends the scan.
+  // DeadOrKillToUnset is a pointer to the previous operand that had the
+  // kill/dead flag set. It tracks the def register of BBI, the use registers
+  // of the AfterBBIs, and the def registers of the AfterBBIs.
+ bool removeRedundantLIs(MachineBasicBlock &MBB,
+ const TargetRegisterInfo *TRI) {
+ LLVM_DEBUG(dbgs() << "Remove redundant load immediates from MBB:\n";
+ MBB.dump(); dbgs() << "\n");
+
+ DenseSet<MachineInstr *> InstrsToErase;
+ for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) {
+      // Skip a load immediate that is already marked to be erased later; it
+      // cannot be used to replace any other instruction.
+ if (InstrsToErase.find(&*BBI) != InstrsToErase.end())
+ continue;
+ // Skip non-load immediate.
+ unsigned Opc = BBI->getOpcode();
+ if (Opc != PPC::LI && Opc != PPC::LI8 && Opc != PPC::LIS &&
+ Opc != PPC::LIS8)
+ continue;
+      // Skip a load immediate whose operand is a relocation (e.g., $r3 =
+      // LI target-flags(ppc-lo) %const.0).
+ if (!BBI->getOperand(1).isImm())
+ continue;
+ assert(BBI->getOperand(0).isReg() &&
+ "Expected a register for the first operand");
+
+ LLVM_DEBUG(dbgs() << "Scanning after load immediate: "; BBI->dump(););
+
+ Register Reg = BBI->getOperand(0).getReg();
+ int64_t Imm = BBI->getOperand(1).getImm();
+ MachineOperand *DeadOrKillToUnset = nullptr;
+ if (BBI->getOperand(0).isDead()) {
+ DeadOrKillToUnset = &BBI->getOperand(0);
+ LLVM_DEBUG(dbgs() << " Kill flag of " << *DeadOrKillToUnset
+ << " from load immediate " << *BBI
+                          << " is an unsetting candidate\n");
+ }
+ // This loop scans instructions after BBI to see if there is any
+ // redundant load immediate.
+ for (auto AfterBBI = std::next(BBI); AfterBBI != MBB.instr_end();
+ ++AfterBBI) {
+        // Track the operand that kills Reg. We will unset the kill flag of
+        // the operand if there is a following redundant load immediate.
+ int KillIdx = AfterBBI->findRegisterUseOperandIdx(Reg, true, TRI);
+ if (KillIdx != -1) {
+ assert(!DeadOrKillToUnset && "Shouldn't kill same register twice");
+ DeadOrKillToUnset = &AfterBBI->getOperand(KillIdx);
+ LLVM_DEBUG(dbgs()
+ << " Kill flag of " << *DeadOrKillToUnset << " from "
+                       << *AfterBBI << " is an unsetting candidate\n");
+ }
+
+ if (!AfterBBI->modifiesRegister(Reg, TRI))
+ continue;
+ // Finish scanning because Reg is overwritten by a non-load
+ // instruction.
+ if (AfterBBI->getOpcode() != Opc)
+ break;
+ assert(AfterBBI->getOperand(0).isReg() &&
+ "Expected a register for the first operand");
+ // Finish scanning because Reg is overwritten by a relocation or a
+ // different value.
+ if (!AfterBBI->getOperand(1).isImm() ||
+ AfterBBI->getOperand(1).getImm() != Imm)
+ break;
+
+        // It loads the same immediate value into the same Reg, which is
+        // redundant. We first unset the kill flag on the previous use of Reg
+        // to extend its live range, then remove the redundancy.
+ if (DeadOrKillToUnset) {
+ LLVM_DEBUG(dbgs()
+ << " Unset dead/kill flag of " << *DeadOrKillToUnset
+ << " from " << *DeadOrKillToUnset->getParent());
+ if (DeadOrKillToUnset->isDef())
+ DeadOrKillToUnset->setIsDead(false);
+ else
+ DeadOrKillToUnset->setIsKill(false);
+ }
+ DeadOrKillToUnset =
+ AfterBBI->findRegisterDefOperand(Reg, true, true, TRI);
+ if (DeadOrKillToUnset)
+ LLVM_DEBUG(dbgs()
+ << " Dead flag of " << *DeadOrKillToUnset << " from "
+                     << *AfterBBI << " is an unsetting candidate\n");
+ InstrsToErase.insert(&*AfterBBI);
+ LLVM_DEBUG(dbgs() << " Remove redundant load immediate: ";
+ AfterBBI->dump());
+ }
+ }
+
+ for (MachineInstr *MI : InstrsToErase) {
+ MI->eraseFromParent();
+ }
+ NumRemovedInPreEmit += InstrsToErase.size();
+ return !InstrsToErase.empty();
+ }
+
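
A simplified, hypothetical analog of the scan above, on a toy instruction list instead of MachineInstrs: every toy instruction is treated as a def of its register, and kill/dead-flag bookkeeping and pure uses are omitted. A later load-immediate is dropped when the most recent write to its register was the same load-immediate.

#include <cstddef>
#include <cstdint>
#include <vector>

struct ToyInst {
  bool IsLI;        // true: "Reg = LI Imm"; false: some other def of Reg
  int Reg;
  std::int64_t Imm; // only meaningful when IsLI is true
};

// Returns the number of redundant load immediates removed from Block.
static std::size_t removeRedundantToyLIs(std::vector<ToyInst> &Block) {
  std::vector<ToyInst> Kept;
  std::size_t Removed = 0;
  for (const ToyInst &I : Block) {
    bool Redundant = false;
    if (I.IsLI) {
      // Find the most recent kept def of I.Reg; the new LI is redundant only
      // if that def loaded the same immediate into the same register.
      for (auto It = Kept.rbegin(); It != Kept.rend(); ++It) {
        if (It->Reg != I.Reg)
          continue;
        Redundant = It->IsLI && It->Imm == I.Imm;
        break;
      }
    }
    if (Redundant)
      ++Removed;
    else
      Kept.push_back(I);
  }
  Block = std::move(Kept);
  return Removed;
}
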
bool runOnMachineFunction(MachineFunction &MF) override {
if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole)
return false;
@@ -65,6 +168,7 @@ namespace {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
SmallVector<MachineInstr *, 4> InstrsToErase;
for (MachineBasicBlock &MBB : MF) {
+ Changed |= removeRedundantLIs(MBB, TRI);
for (MachineInstr &MI : MBB) {
unsigned Opc = MI.getOpcode();
// Detect self copies - these can result from running AADB.
@@ -111,7 +215,7 @@ namespace {
if (Br->getOpcode() != PPC::BC && Br->getOpcode() != PPC::BCn)
continue;
MachineInstr *CRSetMI = nullptr;
- unsigned CRBit = Br->getOperand(0).getReg();
+ Register CRBit = Br->getOperand(0).getReg();
unsigned CRReg = getCRFromCRBit(CRBit);
bool SeenUse = false;
MachineBasicBlock::reverse_iterator It = Br, Er = MBB.rend();
diff --git a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
index 3a83cc27439c..6e9042643820 100644
--- a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
+++ b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -79,8 +79,8 @@ bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
for (auto SI = Splats.begin(); SI != Splats.end();) {
MachineInstr *SMI = *SI;
- unsigned SplatReg = SMI->getOperand(0).getReg();
- unsigned SrcReg = SMI->getOperand(1).getReg();
+ Register SplatReg = SMI->getOperand(0).getReg();
+ Register SrcReg = SMI->getOperand(1).getReg();
if (MI->modifiesRegister(SrcReg, TRI)) {
switch (MI->getOpcode()) {
@@ -102,7 +102,7 @@ bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
// the QPX splat source register.
unsigned SubRegIndex =
TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg());
- unsigned SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
+ Register SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
// Substitute both the explicit defined register, and also the
// implicit def of the containing QPX register.
diff --git a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
index 8eaa6dfe2bf7..3b71ed219c17 100644
--- a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
+++ b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -381,10 +381,10 @@ private:
const MachineBranchProbabilityInfo *MBPI;
// A vector to contain all the CR logical operations
- std::vector<CRLogicalOpInfo> AllCRLogicalOps;
+ SmallVector<CRLogicalOpInfo, 16> AllCRLogicalOps;
void initialize(MachineFunction &MFParm);
void collectCRLogicals();
- bool handleCROp(CRLogicalOpInfo &CRI);
+ bool handleCROp(unsigned Idx);
bool splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI);
static bool isCRLogical(MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
@@ -398,7 +398,7 @@ private:
// Not using a range-based for loop here as the vector may grow while being
// operated on.
for (unsigned i = 0; i < AllCRLogicalOps.size(); i++)
- Changed |= handleCROp(AllCRLogicalOps[i]);
+ Changed |= handleCROp(i);
return Changed;
}
@@ -535,15 +535,15 @@ MachineInstr *PPCReduceCRLogicals::lookThroughCRCopy(unsigned Reg,
unsigned &Subreg,
MachineInstr *&CpDef) {
Subreg = -1;
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return nullptr;
MachineInstr *Copy = MRI->getVRegDef(Reg);
CpDef = Copy;
if (!Copy->isCopy())
return Copy;
- unsigned CopySrc = Copy->getOperand(1).getReg();
+ Register CopySrc = Copy->getOperand(1).getReg();
Subreg = Copy->getOperand(1).getSubReg();
- if (!TargetRegisterInfo::isVirtualRegister(CopySrc)) {
+ if (!Register::isVirtualRegister(CopySrc)) {
const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
// Set the Subreg
if (CopySrc == PPC::CR0EQ || CopySrc == PPC::CR6EQ)
@@ -578,10 +578,11 @@ void PPCReduceCRLogicals::initialize(MachineFunction &MFParam) {
/// a unary CR logical might be used to change the condition code on a
/// comparison feeding it. A nullary CR logical might simply be removable
/// if the user of the bit it [un]sets can be transformed.
-bool PPCReduceCRLogicals::handleCROp(CRLogicalOpInfo &CRI) {
+bool PPCReduceCRLogicals::handleCROp(unsigned Idx) {
// We can definitely split a block on the inputs to a binary CR operation
// whose defs and (single) use are within the same block.
bool Changed = false;
+ CRLogicalOpInfo CRI = AllCRLogicalOps[Idx];
if (CRI.IsBinary && CRI.ContainedInBlock && CRI.SingleUse && CRI.FeedsBR &&
CRI.DefsSingleUse) {
Changed = splitBlockOnBinaryCROp(CRI);
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 12554ea8d079..9ec26a19bdaa 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -325,13 +325,13 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
bool IsPositionIndependent = TM.isPositionIndependent();
if (hasBasePointer(MF)) {
- if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
+ if (Subtarget.is32BitELFABI() && IsPositionIndependent)
markSuperRegs(Reserved, PPC::R29);
else
markSuperRegs(Reserved, PPC::R30);
}
- if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
+ if (Subtarget.is32BitELFABI() && IsPositionIndependent)
markSuperRegs(Reserved, PPC::R30);
// Reserve Altivec registers when Altivec is unavailable.
@@ -391,7 +391,7 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co
bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg,
const MachineFunction &MF) const {
- assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
+ assert(Register::isPhysicalRegister(PhysReg));
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (!TM.isPPC64())
@@ -425,7 +425,6 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
case PPC::G8RC_NOX0RegClassID:
case PPC::GPRC_NOR0RegClassID:
case PPC::SPERCRegClassID:
- case PPC::SPE4RCRegClassID:
case PPC::G8RCRegClassID:
case PPC::GPRCRegClassID: {
unsigned FP = TFI->hasFP(MF) ? 1 : 0;
@@ -527,7 +526,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
// Fortunately, a frame greater than 32K is rare.
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
if (LP64)
@@ -549,7 +548,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
}
bool KillNegSizeReg = MI.getOperand(1).isKill();
- unsigned NegSizeReg = MI.getOperand(1).getReg();
+ Register NegSizeReg = MI.getOperand(1).getReg();
// Grow the stack and update the stack pointer link, then determine the
// address of new allocated space.
@@ -655,8 +654,8 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
- unsigned SrcReg = MI.getOperand(0).getReg();
+ Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ Register SrcReg = MI.getOperand(0).getReg();
// We need to store the CR in the low 4-bits of the saved value. First, issue
// an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg.
@@ -700,8 +699,8 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ Register DestReg = MI.getOperand(0).getReg();
assert(MI.definesRegister(DestReg) &&
"RESTORE_CR does not define its destination");
@@ -744,8 +743,8 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
- unsigned SrcReg = MI.getOperand(0).getReg();
+ Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ Register SrcReg = MI.getOperand(0).getReg();
// Search up the BB to find the definition of the CR bit.
MachineBasicBlock::reverse_iterator Ins;
@@ -823,8 +822,8 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ Register DestReg = MI.getOperand(0).getReg();
assert(MI.definesRegister(DestReg) &&
"RESTORE_CRBIT does not define its destination");
@@ -833,7 +832,7 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg);
- unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ Register RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), RegO)
.addReg(getCRFromCRBit(DestReg));
@@ -870,8 +869,8 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
DebugLoc dl = MI.getDebugLoc();
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
- unsigned SrcReg = MI.getOperand(0).getReg();
+ Register Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+ Register SrcReg = MI.getOperand(0).getReg();
BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg)
.addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
@@ -896,8 +895,8 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
DebugLoc dl = MI.getDebugLoc();
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+ Register DestReg = MI.getOperand(0).getReg();
assert(MI.definesRegister(DestReg) &&
"RESTORE_VRSAVE does not define its destination");
@@ -1128,7 +1127,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
OperandBase = OffsetOperandNo;
}
- unsigned StackReg = MI.getOperand(FIOperandNum).getReg();
+ Register StackReg = MI.getOperand(FIOperandNum).getReg();
MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true);
}
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index af0dff6347a6..4719e947b172 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -253,15 +253,14 @@ def RM: PPCReg<"**ROUNDING MODE**">;
/// Register classes
// Allocate volatiles first
// then nonvolatiles in reverse order since stmw/lmw save from rN to r31
-def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
- (sequence "R%u", 30, 13),
- R31, R0, R1, FP, BP)> {
+def GPRC : RegisterClass<"PPC", [i32,f32], 32, (add (sequence "R%u", 2, 12),
+ (sequence "R%u", 30, 13),
+ R31, R0, R1, FP, BP)> {
// On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
// put it at the end of the list.
let AltOrders = [(add (sub GPRC, R2), R2)];
let AltOrderSelect = [{
- const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
- return S.isPPC64() && S.isSVR4ABI();
+ return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
}];
}
@@ -272,21 +271,19 @@ def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
// put it at the end of the list.
let AltOrders = [(add (sub G8RC, X2), X2)];
let AltOrderSelect = [{
- const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
- return S.isPPC64() && S.isSVR4ABI();
+ return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
}];
}
// For some instructions r0 is special (representing the value 0 instead of
// the value in the r0 register), and we use these register subclasses to
// prevent r0 from being allocated for use by those instructions.
-def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)> {
+def GPRC_NOR0 : RegisterClass<"PPC", [i32,f32], 32, (add (sub GPRC, R0), ZERO)> {
// On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
// put it at the end of the list.
let AltOrders = [(add (sub GPRC_NOR0, R2), R2)];
let AltOrderSelect = [{
- const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
- return S.isPPC64() && S.isSVR4ABI();
+ return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
}];
}
@@ -295,8 +292,7 @@ def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> {
// put it at the end of the list.
let AltOrders = [(add (sub G8RC_NOX0, X2), X2)];
let AltOrderSelect = [{
- const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
- return S.isPPC64() && S.isSVR4ABI();
+ return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
}];
}
@@ -304,8 +300,6 @@ def SPERC : RegisterClass<"PPC", [f64], 64, (add (sequence "S%u", 2, 12),
(sequence "S%u", 30, 13),
S31, S0, S1)>;
-def SPE4RC : RegisterClass<"PPC", [f32], 32, (add GPRC)>;
-
// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
// ABI the size of the Floating-point register save area is determined by the
// allocated non-volatile register with the lowest register number, as FP
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 6aa7528634d3..10568ed4b655 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -60,7 +60,7 @@ PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU,
InstrInfo(*this), TLInfo(TM, *this) {}
void PPCSubtarget::initializeEnvironment() {
- StackAlignment = 16;
+ StackAlignment = Align(16);
DarwinDirective = PPC::DIR_NONE;
HasMFOCRF = false;
Has64BitSupport = false;
@@ -145,7 +145,8 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isDarwin())
HasLazyResolverStubs = true;
- if (TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD() ||
+ if ((TargetTriple.isOSFreeBSD() && TargetTriple.getOSMajorVersion() >= 13) ||
+ TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD() ||
TargetTriple.isMusl())
SecurePlt = true;
@@ -228,18 +229,13 @@ bool PPCSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
}
-unsigned char
-PPCSubtarget::classifyGlobalReference(const GlobalValue *GV) const {
- // Note that currently we don't generate non-pic references.
- // If a caller wants that, this will have to be updated.
-
+bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
// Large code model always uses the TOC even for local symbols.
if (TM.getCodeModel() == CodeModel::Large)
- return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG;
-
+ return true;
if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
- return PPCII::MO_PIC_FLAG;
- return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG;
+ return false;
+ return true;
}
bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 55fec1cb6d99..d96c2893aee9 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -78,7 +78,7 @@ protected:
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
- unsigned StackAlignment;
+ Align StackAlignment;
/// Selected instruction itineraries (one entry per itinerary class.)
InstrItineraryData InstrItins;
@@ -166,7 +166,7 @@ public:
/// getStackAlignment - Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
- unsigned getStackAlignment() const { return StackAlignment; }
+ Align getStackAlignment() const { return StackAlignment; }
/// getDarwinDirective - Returns the -m directive specified for the cpu.
///
@@ -210,7 +210,11 @@ public:
/// instructions, regardless of whether we are in 32-bit or 64-bit mode.
bool has64BitSupport() const { return Has64BitSupport; }
// useSoftFloat - Return true if soft-float option is turned on.
- bool useSoftFloat() const { return !HasHardFloat; }
+ bool useSoftFloat() const {
+ if (isAIXABI() && !HasHardFloat)
+ report_fatal_error("soft-float is not yet supported on AIX.");
+ return !HasHardFloat;
+ }
/// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
/// registers in 32-bit mode when possible. This can only true if
@@ -277,11 +281,11 @@ public:
bool hasDirectMove() const { return HasDirectMove; }
bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; }
- unsigned getPlatformStackAlignment() const {
+ Align getPlatformStackAlignment() const {
if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned())
- return 32;
+ return Align(32);
- return 16;
+ return Align(16);
}
// DarwinABI has a 224-byte red zone. PPC32 SVR4ABI(Non-DarwinABI) has no
@@ -316,6 +320,9 @@ public:
bool isSVR4ABI() const { return !isDarwinABI() && !isAIXABI(); }
bool isELFv2ABI() const;
+ bool is64BitELFABI() const { return isSVR4ABI() && isPPC64(); }
+ bool is32BitELFABI() const { return isSVR4ABI() && !isPPC64(); }
+
/// Originally, this function return hasISEL(). Now we always enable it,
/// but may expand the ISEL instruction later.
bool enableEarlyIfConversion() const override { return true; }
@@ -337,9 +344,8 @@ public:
bool enableSubRegLiveness() const override;
- /// classifyGlobalReference - Classify a global variable reference for the
- /// current subtarget accourding to how we should reference it.
- unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+ /// True if the GV will be accessed via an indirect symbol.
+ bool isGVIndirectSymbol(const GlobalValue *GV) const;
bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; }
};
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index fb826c4a32f1..8f313d9d01c4 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -74,8 +74,8 @@ protected:
LLVM_DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI);
- unsigned OutReg = MI.getOperand(0).getReg();
- unsigned InReg = MI.getOperand(1).getReg();
+ Register OutReg = MI.getOperand(0).getReg();
+ Register InReg = MI.getOperand(1).getReg();
DebugLoc DL = MI.getDebugLoc();
unsigned GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
unsigned Opc1, Opc2;
diff --git a/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/lib/Target/PowerPC/PPCTOCRegDeps.cpp
index 3eb0569fb955..895ae6744421 100644
--- a/lib/Target/PowerPC/PPCTOCRegDeps.cpp
+++ b/lib/Target/PowerPC/PPCTOCRegDeps.cpp
@@ -95,7 +95,8 @@ namespace {
protected:
bool hasTOCLoReloc(const MachineInstr &MI) {
if (MI.getOpcode() == PPC::LDtocL ||
- MI.getOpcode() == PPC::ADDItocL)
+ MI.getOpcode() == PPC::ADDItocL ||
+ MI.getOpcode() == PPC::LWZtocL)
return true;
for (const MachineOperand &MO : MI.operands()) {
@@ -109,11 +110,15 @@ protected:
bool processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
+ const bool isPPC64 =
+ MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64();
+ const unsigned TOCReg = isPPC64 ? PPC::X2 : PPC::R2;
+
for (auto &MI : MBB) {
if (!hasTOCLoReloc(MI))
continue;
- MI.addOperand(MachineOperand::CreateReg(PPC::X2,
+ MI.addOperand(MachineOperand::CreateReg(TOCReg,
false /*IsDef*/,
true /*IsImp*/));
Changed = true;
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index ce00f848dd72..abefee8b339d 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -93,7 +93,7 @@ EnableMachineCombinerPass("ppc-machine-combiner",
static cl::opt<bool>
ReduceCRLogical("ppc-reduce-cr-logicals",
cl::desc("Expand eligible cr-logical binary ops to branches"),
- cl::init(false), cl::Hidden);
+ cl::init(true), cl::Hidden);
extern "C" void LLVMInitializePowerPCTarget() {
// Register the targets
RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
@@ -185,12 +185,13 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
- // If it isn't a Mach-O file then it's going to be a linux ELF
- // object file.
if (TT.isOSDarwin())
- return llvm::make_unique<TargetLoweringObjectFileMachO>();
+ return std::make_unique<TargetLoweringObjectFileMachO>();
+
+ if (TT.isOSAIX())
+ return std::make_unique<TargetLoweringObjectFileXCOFF>();
- return llvm::make_unique<PPC64LinuxTargetObjectFile>();
+ return std::make_unique<PPC64LinuxTargetObjectFile>();
}
static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
@@ -248,10 +249,19 @@ static CodeModel::Model getEffectivePPCCodeModel(const Triple &TT,
report_fatal_error("Target does not support the kernel CodeModel", false);
return *CM;
}
- if (!TT.isOSDarwin() && !JIT &&
- (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
- return CodeModel::Medium;
- return CodeModel::Small;
+
+ if (JIT)
+ return CodeModel::Small;
+ if (TT.isOSAIX())
+ return CodeModel::Small;
+
+ assert(TT.isOSBinFormatELF() && "All remaining PPC OSes are ELF based.");
+
+ if (TT.isArch32Bit())
+ return CodeModel::Small;
+
+ assert(TT.isArch64Bit() && "Unsupported PPC architecture.");
+ return CodeModel::Medium;
}
@@ -259,8 +269,8 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) {
const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>();
ScheduleDAGMILive *DAG =
new ScheduleDAGMILive(C, ST.usePPCPreRASchedStrategy() ?
- llvm::make_unique<PPCPreRASchedStrategy>(C) :
- llvm::make_unique<GenericScheduler>(C));
+ std::make_unique<PPCPreRASchedStrategy>(C) :
+ std::make_unique<GenericScheduler>(C));
// add DAG Mutations here.
DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
return DAG;
@@ -271,8 +281,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler(
const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>();
ScheduleDAGMI *DAG =
new ScheduleDAGMI(C, ST.usePPCPostRASchedStrategy() ?
- llvm::make_unique<PPCPostRASchedStrategy>(C) :
- llvm::make_unique<PostGenericScheduler>(C), true);
+ std::make_unique<PPCPostRASchedStrategy>(C) :
+ std::make_unique<PostGenericScheduler>(C), true);
// add DAG Mutations here.
return DAG;
}
@@ -328,7 +338,7 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<PPCSubtarget>(
+ I = std::make_unique<PPCSubtarget>(
TargetTriple, CPU,
// FIXME: It would be good to have the subtarget additions here
// not necessary. Anything that turns them on/off (overrides) ends
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index ff3dfbfaca05..f51300c656aa 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -594,10 +594,37 @@ bool PPCTTIImpl::enableInterleavedAccessVectorization() {
return true;
}
-unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
- if (Vector && !ST->hasAltivec() && !ST->hasQPX())
- return 0;
- return ST->hasVSX() ? 64 : 32;
+unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ assert(ClassID == GPRRC || ClassID == FPRRC ||
+ ClassID == VRRC || ClassID == VSXRC);
+ if (ST->hasVSX()) {
+ assert(ClassID == GPRRC || ClassID == VSXRC);
+ return ClassID == GPRRC ? 32 : 64;
+ }
+ assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
+ return 32;
+}
+
+unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
+ if (Vector)
+ return ST->hasVSX() ? VSXRC : VRRC;
+ else if (Ty && Ty->getScalarType()->isFloatTy())
+ return ST->hasVSX() ? VSXRC : FPRRC;
+ else
+ return GPRRC;
+}
+
+const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
+
+ switch (ClassID) {
+ default:
+ llvm_unreachable("unknown register class");
+ return "PPC::unknown register class";
+ case GPRRC: return "PPC::GPRRC";
+ case FPRRC: return "PPC::FPRRC";
+ case VRRC: return "PPC::VRRC";
+ case VSXRC: return "PPC::VSXRC";
+ }
}
unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
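The three hooks added above carve the PPC register file into four class IDs and report how many registers each class provides (with VSX the FP and vector files are unified into 64 VSX registers). Below is a minimal standalone C++ sketch of that mapping, not LLVM code: the enum mirrors the patch, HasVSX stands in for the PPCSubtarget query, and the real getRegisterClassForType inspects Ty->getScalarType()->isFloatTy() rather than a plain flag.

#include <cassert>
#include <cstdio>

// Mirrors the PPCRegisterClass enum introduced in the patch above.
enum PPCRegisterClass { GPRRC, FPRRC, VRRC, VSXRC };

// Stand-in for PPCSubtarget::hasVSX(); hard-wired here for illustration.
static const bool HasVSX = true;

// Same shape as getRegisterClassForType: vectors (and, with VSX, scalar
// floats) use the unified VSX class, otherwise the FP and GP classes.
static unsigned classForType(bool IsVector, bool IsFloat) {
  if (IsVector)
    return HasVSX ? VSXRC : VRRC;
  if (IsFloat)
    return HasVSX ? VSXRC : FPRRC;
  return GPRRC;
}

// Same shape as getNumberOfRegisters: 32 GPRs; 64 vector/FP registers when
// VSX unifies them, 32 otherwise.
static unsigned numRegisters(unsigned ClassID) {
  if (ClassID == GPRRC)
    return 32;
  return HasVSX ? 64 : 32;
}

int main() {
  unsigned VecClass = classForType(/*IsVector=*/true, /*IsFloat=*/false);
  assert(VecClass == VSXRC && numRegisters(VecClass) == 64);
  std::printf("vector class id %u provides %u registers\n", VecClass,
              numRegisters(VecClass));
  return 0;
}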
@@ -613,7 +640,7 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
}
-unsigned PPCTTIImpl::getCacheLineSize() {
+unsigned PPCTTIImpl::getCacheLineSize() const {
// Check first if the user specified a custom line size.
if (CacheLineSize.getNumOccurrences() > 0)
return CacheLineSize;
@@ -628,7 +655,7 @@ unsigned PPCTTIImpl::getCacheLineSize() {
return 64;
}
-unsigned PPCTTIImpl::getPrefetchDistance() {
+unsigned PPCTTIImpl::getPrefetchDistance() const {
// This seems like a reasonable default for the BG/Q (this pass is enabled, by
// default, only on the BG/Q).
return 300;
@@ -752,6 +779,35 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return 0;
return Cost;
+
+ } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
+ if (ST->hasP9Altivec()) {
+ if (ISD == ISD::INSERT_VECTOR_ELT)
+ // A move-to VSR and a permute/insert. Assume vector operation cost
+ // for both (cost will be 2x on P9).
+ return vectorCostAdjustment(2, Opcode, Val, nullptr);
+
+ // It's an extract. Maybe we can do a cheap move-from VSR.
+ unsigned EltSize = Val->getScalarSizeInBits();
+ if (EltSize == 64) {
+ unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0;
+ if (Index == MfvsrdIndex)
+ return 1;
+ } else if (EltSize == 32) {
+ unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
+ if (Index == MfvsrwzIndex)
+ return 1;
+ }
+
+ // We need a vector extract (or mfvsrld). Assume vector operation cost.
+ // The cost of the load constant for a vector extract is disregarded
+ // (invariant, easily schedulable).
+ return vectorCostAdjustment(1, Opcode, Val, nullptr);
+
+ } else if (ST->hasDirectMove())
+ // Assume permute has standard cost.
+ // Assume move-to/move-from VSR have 2x standard cost.
+ return 3;
}
// Estimated cost of a load-hit-store delay. This was obtained
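The Power9 branch added to getVectorInstrCost above treats exactly one lane as cheap to extract, because a single mfvsrd (64-bit elements) or mfvsrwz (32-bit elements) can read it directly, and that lane index flips with endianness. A small standalone restatement of just the lane selection, using the same indices as the patch; it is illustrative only and not part of the change:

#include <cstdio>

// Which element index a single move-from-VSR can extract directly on P9,
// mirroring the Index checks added to getVectorInstrCost above.
// Returns -1 when no single lane is special for that element size.
static int cheapDirectMoveLane(unsigned EltSizeInBits, bool IsLittleEndian) {
  if (EltSizeInBits == 64)
    return IsLittleEndian ? 1 : 0; // mfvsrd
  if (EltSizeInBits == 32)
    return IsLittleEndian ? 2 : 1; // mfvsrwz
  return -1;
}

int main() {
  // On little-endian P9, extracting element 1 of a <2 x i64> is a single
  // mfvsrd (cost 1 in the patch); every other lane pays the vector-op cost.
  std::printf("cheap v2i64 lane, LE: %d\n", cheapDirectMoveLane(64, true));
  std::printf("cheap v4i32 lane, BE: %d\n", cheapDirectMoveLane(32, false));
  return 0;
}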
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 5d76ee418b69..83a70364bf68 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -72,10 +72,16 @@ public:
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
- unsigned getNumberOfRegisters(bool Vector);
+
+ enum PPCRegisterClass {
+ GPRRC, FPRRC, VRRC, VSXRC
+ };
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
+ unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const;
+ const char* getRegisterClassName(unsigned ClassID) const;
unsigned getRegisterBitWidth(bool Vector) const;
- unsigned getCacheLineSize();
- unsigned getPrefetchDistance();
+ unsigned getCacheLineSize() const override;
+ unsigned getPrefetchDistance() const override;
unsigned getMaxInterleaveFactor(unsigned VF);
int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2);
int getArithmeticInstrCost(
diff --git a/lib/Target/PowerPC/PPCVSXCopy.cpp b/lib/Target/PowerPC/PPCVSXCopy.cpp
index 719ed7b63878..3463bbbdc5f0 100644
--- a/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ b/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -50,7 +50,7 @@ namespace {
bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
MachineRegisterInfo &MRI) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
return RC->hasSubClassEq(MRI.getRegClass(Reg));
} else if (RC->contains(Reg)) {
return true;
@@ -102,7 +102,7 @@ protected:
IsVSFReg(SrcMO.getReg(), MRI)) &&
"Unknown source for a VSX copy");
- unsigned NewVReg = MRI.createVirtualRegister(SrcRC);
+ Register NewVReg = MRI.createVirtualRegister(SrcRC);
BuildMI(MBB, MI, MI.getDebugLoc(),
TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
.addImm(1) // add 1, not 0, because there is no implicit clearing
@@ -124,7 +124,7 @@ protected:
"Unknown destination for a VSX copy");
// Copy the VSX value into a new VSX register of the correct subclass.
- unsigned NewVReg = MRI.createVirtualRegister(DstRC);
+ Register NewVReg = MRI.createVirtualRegister(DstRC);
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
NewVReg)
.add(SrcMO);
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index ce78239df0a8..5e150be544ed 100644
--- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -126,8 +126,8 @@ protected:
if (!AddendMI->isFullCopy())
continue;
- unsigned AddendSrcReg = AddendMI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) {
+ Register AddendSrcReg = AddendMI->getOperand(1).getReg();
+ if (Register::isVirtualRegister(AddendSrcReg)) {
if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
MRI.getRegClass(AddendSrcReg))
continue;
@@ -182,12 +182,12 @@ protected:
// %5 = A-form-op %5, %5, %11;
// where %5 and %11 are both kills. This case would be skipped
// otherwise.
- unsigned OldFMAReg = MI.getOperand(0).getReg();
+ Register OldFMAReg = MI.getOperand(0).getReg();
// Find one of the product operands that is killed by this instruction.
unsigned KilledProdOp = 0, OtherProdOp = 0;
- unsigned Reg2 = MI.getOperand(2).getReg();
- unsigned Reg3 = MI.getOperand(3).getReg();
+ Register Reg2 = MI.getOperand(2).getReg();
+ Register Reg3 = MI.getOperand(3).getReg();
if (LIS->getInterval(Reg2).Query(FMAIdx).isKill()
&& Reg2 != OldFMAReg) {
KilledProdOp = 2;
@@ -208,14 +208,14 @@ protected:
// legality checks above, the live range for the addend source register
// could be extended), but it seems likely that such a trivial copy can
// be coalesced away later, and thus is not worth the effort.
- if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) &&
+ if (Register::isVirtualRegister(AddendSrcReg) &&
!LIS->getInterval(AddendSrcReg).liveAt(FMAIdx))
continue;
// Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
- unsigned KilledProdReg = MI.getOperand(KilledProdOp).getReg();
- unsigned OtherProdReg = MI.getOperand(OtherProdOp).getReg();
+ Register KilledProdReg = MI.getOperand(KilledProdOp).getReg();
+ Register OtherProdReg = MI.getOperand(OtherProdOp).getReg();
unsigned AddSubReg = AddendMI->getOperand(1).getSubReg();
unsigned KilledProdSubReg = MI.getOperand(KilledProdOp).getSubReg();
@@ -314,7 +314,7 @@ protected:
// Extend the live interval of the addend source (it might end at the
// copy to be removed, or somewhere in between there and here). This
// is necessary only if it is a physical register.
- if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg))
+ if (!Register::isVirtualRegister(AddendSrcReg))
for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid();
++Units) {
unsigned Unit = *Units;
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 44175af7f9b6..c3729da0b07b 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -158,7 +158,7 @@ private:
// Return true iff the given register is in the given class.
bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return RC->hasSubClassEq(MRI->getRegClass(Reg));
return RC->contains(Reg);
}
@@ -253,7 +253,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (isAnyVecReg(Reg, Partial)) {
RelevantInstr = true;
break;
@@ -566,7 +566,7 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
CopySrcReg = MI->getOperand(2).getReg();
}
- if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) {
+ if (!Register::isVirtualRegister(CopySrcReg)) {
if (!isScalarVecReg(CopySrcReg))
SwapVector[VecIdx].MentionsPhysVR = 1;
return CopySrcReg;
@@ -601,11 +601,11 @@ void PPCVSXSwapRemoval::formWebs() {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!isVecReg(Reg) && !isScalarVecReg(Reg))
continue;
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!Register::isVirtualRegister(Reg)) {
if (!(MI->isCopy() && isScalarVecReg(Reg)))
SwapVector[EntryIdx].MentionsPhysVR = 1;
continue;
@@ -667,7 +667,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
// than a swap instruction.
else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- unsigned DefReg = MI->getOperand(0).getReg();
+ Register DefReg = MI->getOperand(0).getReg();
// We skip debug instructions in the analysis. (Note that debug
// location information is still maintained by this optimization
@@ -695,9 +695,9 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
// other than a swap instruction.
} else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- unsigned UseReg = MI->getOperand(0).getReg();
+ Register UseReg = MI->getOperand(0).getReg();
MachineInstr *DefMI = MRI->getVRegDef(UseReg);
- unsigned DefReg = DefMI->getOperand(0).getReg();
+ Register DefReg = DefMI->getOperand(0).getReg();
int DefIdx = SwapMap[DefMI];
if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
@@ -756,7 +756,7 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
if (!SwapVector[Repr].WebRejected) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- unsigned DefReg = MI->getOperand(0).getReg();
+ Register DefReg = MI->getOperand(0).getReg();
for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
int UseIdx = SwapMap[&UseMI];
@@ -772,7 +772,7 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
if (!SwapVector[Repr].WebRejected) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
- unsigned UseReg = MI->getOperand(0).getReg();
+ Register UseReg = MI->getOperand(0).getReg();
MachineInstr *DefMI = MRI->getVRegDef(UseReg);
int DefIdx = SwapMap[DefMI];
SwapVector[DefIdx].WillRemove = 1;
@@ -869,8 +869,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
Selector = 3 - Selector;
MI->getOperand(3).setImm(Selector);
- unsigned Reg1 = MI->getOperand(1).getReg();
- unsigned Reg2 = MI->getOperand(2).getReg();
+ Register Reg1 = MI->getOperand(1).getReg();
+ Register Reg2 = MI->getOperand(2).getReg();
MI->getOperand(1).setReg(Reg2);
MI->getOperand(2).setReg(Reg1);
@@ -894,9 +894,9 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
LLVM_DEBUG(MI->dump());
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
- unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+ Register NewVReg = MRI->createVirtualRegister(DstRC);
MI->getOperand(0).setReg(NewVReg);
LLVM_DEBUG(dbgs() << " Into: ");
@@ -910,8 +910,8 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
// prior to the swap, and from VSRC to VRRC following the swap.
// Coalescing will usually remove all this mess.
if (DstRC == &PPC::VRRCRegClass) {
- unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
- unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
+ Register VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
+ Register VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
TII->get(PPC::COPY), VSRCTmp1)
diff --git a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 0172c6298772..300ba8dc675c 100644
--- a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -79,7 +80,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
// Helper to emit a combination of LUI, ADDI(W), and SLLI instructions that
// synthesize the desired immediate value into the destination register.
- void emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out);
+ void emitLoadImm(Register DestReg, int64_t Value, MCStreamer &Out);
// Helper to emit a combination of AUIPC and SecondOpcode. Used to implement
// helpers such as emitLoadLocalAddress and emitLoadAddress.
@@ -127,6 +128,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseRegister(OperandVector &Operands,
bool AllowParens = false);
OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands);
+ OperandMatchResultTy parseAtomicMemOp(OperandVector &Operands);
OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands);
OperandMatchResultTy parseBareSymbol(OperandVector &Operands);
OperandMatchResultTy parseCallSymbol(OperandVector &Operands);
@@ -193,7 +195,7 @@ public:
/// instruction
struct RISCVOperand : public MCParsedAsmOperand {
- enum KindTy {
+ enum class KindTy {
Token,
Register,
Immediate,
@@ -203,7 +205,7 @@ struct RISCVOperand : public MCParsedAsmOperand {
bool IsRV64;
struct RegOp {
- unsigned RegNum;
+ Register RegNum;
};
struct ImmOp {
@@ -235,26 +237,26 @@ public:
StartLoc = o.StartLoc;
EndLoc = o.EndLoc;
switch (Kind) {
- case Register:
+ case KindTy::Register:
Reg = o.Reg;
break;
- case Immediate:
+ case KindTy::Immediate:
Imm = o.Imm;
break;
- case Token:
+ case KindTy::Token:
Tok = o.Tok;
break;
- case SystemRegister:
+ case KindTy::SystemRegister:
SysReg = o.SysReg;
break;
}
}
- bool isToken() const override { return Kind == Token; }
- bool isReg() const override { return Kind == Register; }
- bool isImm() const override { return Kind == Immediate; }
+ bool isToken() const override { return Kind == KindTy::Token; }
+ bool isReg() const override { return Kind == KindTy::Register; }
+ bool isImm() const override { return Kind == KindTy::Immediate; }
bool isMem() const override { return false; }
- bool isSystemRegister() const { return Kind == SystemRegister; }
+ bool isSystemRegister() const { return Kind == KindTy::SystemRegister; }
static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm,
RISCVMCExpr::VariantKind &VK) {
@@ -276,7 +278,7 @@ public:
// modifiers and isShiftedInt<N-1, 1>(Op).
template <int N> bool isBareSimmNLsb0() const {
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
if (!isImm())
return false;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
@@ -292,7 +294,7 @@ public:
bool isBareSymbol() const {
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
// Must be of 'immediate' type but not a constant.
if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
return false;
@@ -302,7 +304,7 @@ public:
bool isCallSymbol() const {
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
// Must be of 'immediate' type but not a constant.
if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
return false;
@@ -313,7 +315,7 @@ public:
bool isTPRelAddSymbol() const {
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
// Must be of 'immediate' type but not a constant.
if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
return false;
@@ -364,7 +366,7 @@ public:
bool isImmXLenLI() const {
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
if (!isImm())
return false;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
@@ -372,13 +374,13 @@ public:
return true;
// Given only Imm, ensuring that the actually specified constant is either
// a signed or unsigned 64-bit number is unfortunately impossible.
- bool IsInRange = isRV64() ? true : isInt<32>(Imm) || isUInt<32>(Imm);
- return IsConstantImm && IsInRange && VK == RISCVMCExpr::VK_RISCV_None;
+ return IsConstantImm && VK == RISCVMCExpr::VK_RISCV_None &&
+ (isRV64() || (isInt<32>(Imm) || isUInt<32>(Imm)));
}
bool isUImmLog2XLen() const {
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
if (!isImm())
return false;
if (!evaluateConstantImm(getImm(), Imm, VK) ||
@@ -389,7 +391,7 @@ public:
bool isUImmLog2XLenNonZero() const {
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
if (!isImm())
return false;
if (!evaluateConstantImm(getImm(), Imm, VK) ||
@@ -402,7 +404,7 @@ public:
bool isUImm5() const {
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
if (!isImm())
return false;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
@@ -411,7 +413,7 @@ public:
bool isUImm5NonZero() const {
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
if (!isImm())
return false;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
@@ -422,7 +424,7 @@ public:
bool isSImm6() const {
if (!isImm())
return false;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
int64_t Imm;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && isInt<6>(Imm) &&
@@ -432,7 +434,7 @@ public:
bool isSImm6NonZero() const {
if (!isImm())
return false;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
int64_t Imm;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && isInt<6>(Imm) && (Imm != 0) &&
@@ -443,7 +445,7 @@ public:
if (!isImm())
return false;
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && (Imm != 0) &&
(isUInt<5>(Imm) || (Imm >= 0xfffe0 && Imm <= 0xfffff)) &&
@@ -454,7 +456,7 @@ public:
if (!isImm())
return false;
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && isShiftedUInt<5, 2>(Imm) &&
VK == RISCVMCExpr::VK_RISCV_None;
@@ -464,7 +466,7 @@ public:
if (!isImm())
return false;
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && isShiftedUInt<6, 2>(Imm) &&
VK == RISCVMCExpr::VK_RISCV_None;
@@ -474,7 +476,7 @@ public:
if (!isImm())
return false;
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && isShiftedUInt<5, 3>(Imm) &&
VK == RISCVMCExpr::VK_RISCV_None;
@@ -486,7 +488,7 @@ public:
if (!isImm())
return false;
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && isShiftedUInt<6, 3>(Imm) &&
VK == RISCVMCExpr::VK_RISCV_None;
@@ -496,14 +498,14 @@ public:
if (!isImm())
return false;
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && isShiftedUInt<8, 2>(Imm) && (Imm != 0) &&
VK == RISCVMCExpr::VK_RISCV_None;
}
bool isSImm12() const {
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
int64_t Imm;
bool IsValid;
if (!isImm())
@@ -527,14 +529,14 @@ public:
if (!isImm())
return false;
int64_t Imm;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && (Imm != 0) && isShiftedInt<6, 4>(Imm) &&
VK == RISCVMCExpr::VK_RISCV_None;
}
bool isUImm20LUI() const {
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
int64_t Imm;
bool IsValid;
if (!isImm())
@@ -552,7 +554,7 @@ public:
}
bool isUImm20AUIPC() const {
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
int64_t Imm;
bool IsValid;
if (!isImm())
@@ -575,6 +577,15 @@ public:
bool isSImm21Lsb0JAL() const { return isBareSimmNLsb0<21>(); }
+ bool isImmZero() const {
+ if (!isImm())
+ return false;
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ return IsConstantImm && (Imm == 0) && VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
/// getStartLoc - Gets location of the first token of this operand
SMLoc getStartLoc() const override { return StartLoc; }
/// getEndLoc - Gets location of the last token of this operand
@@ -583,38 +594,38 @@ public:
bool isRV64() const { return IsRV64; }
unsigned getReg() const override {
- assert(Kind == Register && "Invalid type access!");
- return Reg.RegNum;
+ assert(Kind == KindTy::Register && "Invalid type access!");
+ return Reg.RegNum.id();
}
StringRef getSysReg() const {
- assert(Kind == SystemRegister && "Invalid access!");
+ assert(Kind == KindTy::SystemRegister && "Invalid access!");
return StringRef(SysReg.Data, SysReg.Length);
}
const MCExpr *getImm() const {
- assert(Kind == Immediate && "Invalid type access!");
+ assert(Kind == KindTy::Immediate && "Invalid type access!");
return Imm.Val;
}
StringRef getToken() const {
- assert(Kind == Token && "Invalid type access!");
+ assert(Kind == KindTy::Token && "Invalid type access!");
return Tok;
}
void print(raw_ostream &OS) const override {
switch (Kind) {
- case Immediate:
+ case KindTy::Immediate:
OS << *getImm();
break;
- case Register:
+ case KindTy::Register:
OS << "<register x";
OS << getReg() << ">";
break;
- case Token:
+ case KindTy::Token:
OS << "'" << getToken() << "'";
break;
- case SystemRegister:
+ case KindTy::SystemRegister:
OS << "<sysreg: " << getSysReg() << '>';
break;
}
@@ -622,7 +633,7 @@ public:
static std::unique_ptr<RISCVOperand> createToken(StringRef Str, SMLoc S,
bool IsRV64) {
- auto Op = make_unique<RISCVOperand>(Token);
+ auto Op = std::make_unique<RISCVOperand>(KindTy::Token);
Op->Tok = Str;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -632,7 +643,7 @@ public:
static std::unique_ptr<RISCVOperand> createReg(unsigned RegNo, SMLoc S,
SMLoc E, bool IsRV64) {
- auto Op = make_unique<RISCVOperand>(Register);
+ auto Op = std::make_unique<RISCVOperand>(KindTy::Register);
Op->Reg.RegNum = RegNo;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -642,7 +653,7 @@ public:
static std::unique_ptr<RISCVOperand> createImm(const MCExpr *Val, SMLoc S,
SMLoc E, bool IsRV64) {
- auto Op = make_unique<RISCVOperand>(Immediate);
+ auto Op = std::make_unique<RISCVOperand>(KindTy::Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -652,7 +663,7 @@ public:
static std::unique_ptr<RISCVOperand>
createSysReg(StringRef Str, SMLoc S, unsigned Encoding, bool IsRV64) {
- auto Op = make_unique<RISCVOperand>(SystemRegister);
+ auto Op = std::make_unique<RISCVOperand>(KindTy::SystemRegister);
Op->SysReg.Data = Str.data();
Op->SysReg.Length = Str.size();
Op->SysReg.Encoding = Encoding;
@@ -664,7 +675,7 @@ public:
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
assert(Expr && "Expr shouldn't be null!");
int64_t Imm = 0;
- RISCVMCExpr::VariantKind VK;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
bool IsConstant = evaluateConstantImm(Expr, Imm, VK);
if (IsConstant)
@@ -730,46 +741,9 @@ public:
#define GET_MATCHER_IMPLEMENTATION
#include "RISCVGenAsmMatcher.inc"
-// Return the matching FPR64 register for the given FPR32.
-// FIXME: Ideally this function could be removed in favour of using
-// information from TableGen.
-unsigned convertFPR32ToFPR64(unsigned Reg) {
- switch (Reg) {
- default:
- llvm_unreachable("Not a recognised FPR32 register");
- case RISCV::F0_32: return RISCV::F0_64;
- case RISCV::F1_32: return RISCV::F1_64;
- case RISCV::F2_32: return RISCV::F2_64;
- case RISCV::F3_32: return RISCV::F3_64;
- case RISCV::F4_32: return RISCV::F4_64;
- case RISCV::F5_32: return RISCV::F5_64;
- case RISCV::F6_32: return RISCV::F6_64;
- case RISCV::F7_32: return RISCV::F7_64;
- case RISCV::F8_32: return RISCV::F8_64;
- case RISCV::F9_32: return RISCV::F9_64;
- case RISCV::F10_32: return RISCV::F10_64;
- case RISCV::F11_32: return RISCV::F11_64;
- case RISCV::F12_32: return RISCV::F12_64;
- case RISCV::F13_32: return RISCV::F13_64;
- case RISCV::F14_32: return RISCV::F14_64;
- case RISCV::F15_32: return RISCV::F15_64;
- case RISCV::F16_32: return RISCV::F16_64;
- case RISCV::F17_32: return RISCV::F17_64;
- case RISCV::F18_32: return RISCV::F18_64;
- case RISCV::F19_32: return RISCV::F19_64;
- case RISCV::F20_32: return RISCV::F20_64;
- case RISCV::F21_32: return RISCV::F21_64;
- case RISCV::F22_32: return RISCV::F22_64;
- case RISCV::F23_32: return RISCV::F23_64;
- case RISCV::F24_32: return RISCV::F24_64;
- case RISCV::F25_32: return RISCV::F25_64;
- case RISCV::F26_32: return RISCV::F26_64;
- case RISCV::F27_32: return RISCV::F27_64;
- case RISCV::F28_32: return RISCV::F28_64;
- case RISCV::F29_32: return RISCV::F29_64;
- case RISCV::F30_32: return RISCV::F30_64;
- case RISCV::F31_32: return RISCV::F31_64;
- }
+static Register convertFPR64ToFPR32(Register Reg) {
+ assert(Reg >= RISCV::F0_D && Reg <= RISCV::F31_D && "Invalid register");
+ return Reg - RISCV::F0_D + RISCV::F0_F;
}
unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
@@ -778,17 +752,17 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
if (!Op.isReg())
return Match_InvalidOperand;
- unsigned Reg = Op.getReg();
- bool IsRegFPR32 =
- RISCVMCRegisterClasses[RISCV::FPR32RegClassID].contains(Reg);
- bool IsRegFPR32C =
- RISCVMCRegisterClasses[RISCV::FPR32CRegClassID].contains(Reg);
+ Register Reg = Op.getReg();
+ bool IsRegFPR64 =
+ RISCVMCRegisterClasses[RISCV::FPR64RegClassID].contains(Reg);
+ bool IsRegFPR64C =
+ RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains(Reg);
// As the parser couldn't differentiate an FPR32 from an FPR64, coerce the
- // register from FPR32 to FPR64 or FPR32C to FPR64C if necessary.
- if ((IsRegFPR32 && Kind == MCK_FPR64) ||
- (IsRegFPR32C && Kind == MCK_FPR64C)) {
- Op.Reg.RegNum = convertFPR32ToFPR64(Reg);
+ // register from FPR64 to FPR32 or FPR64C to FPR32C if necessary.
+ if ((IsRegFPR64 && Kind == MCK_FPR32) ||
+ (IsRegFPR64C && Kind == MCK_FPR32C)) {
+ Op.Reg.RegNum = convertFPR64ToFPR32(Reg);
return Match_Success;
}
return Match_InvalidOperand;
@@ -853,6 +827,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return generateImmOutOfRangeError(Operands, ErrorInfo,
std::numeric_limits<int32_t>::min(),
std::numeric_limits<uint32_t>::max());
+ case Match_InvalidImmZero: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "immediate must be zero");
+ }
case Match_InvalidUImmLog2XLen:
if (isRV64())
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1);
@@ -968,14 +946,19 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// alternative ABI names), setting RegNo to the matching register. Upon
// failure, returns true and sets RegNo to 0. If IsRV32E then registers
// x16-x31 will be rejected.
-static bool matchRegisterNameHelper(bool IsRV32E, unsigned &RegNo,
+static bool matchRegisterNameHelper(bool IsRV32E, Register &RegNo,
StringRef Name) {
RegNo = MatchRegisterName(Name);
- if (RegNo == 0)
+ // The 32- and 64-bit FPRs have the same asm name. Check that the initial
+ // match always matches the 64-bit variant, and not the 32-bit one.
+ assert(!(RegNo >= RISCV::F0_F && RegNo <= RISCV::F31_F));
+ // The default FPR register class is based on the tablegen enum ordering.
+ static_assert(RISCV::F0_D < RISCV::F0_F, "FPR matching must be updated");
+ if (RegNo == RISCV::NoRegister)
RegNo = MatchRegisterAltName(Name);
if (IsRV32E && RegNo >= RISCV::X16 && RegNo <= RISCV::X31)
- RegNo = 0;
- return RegNo == 0;
+ RegNo = RISCV::NoRegister;
+ return RegNo == RISCV::NoRegister;
}
bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
@@ -986,7 +969,7 @@ bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
RegNo = 0;
StringRef Name = getLexer().getTok().getIdentifier();
- if (matchRegisterNameHelper(isRV32E(), RegNo, Name))
+ if (matchRegisterNameHelper(isRV32E(), (Register&)RegNo, Name))
return Error(StartLoc, "invalid register name");
getParser().Lex(); // Eat identifier token.
@@ -1018,10 +1001,10 @@ OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands,
return MatchOperand_NoMatch;
case AsmToken::Identifier:
StringRef Name = getLexer().getTok().getIdentifier();
- unsigned RegNo;
+ Register RegNo;
matchRegisterNameHelper(isRV32E(), RegNo, Name);
- if (RegNo == 0) {
+ if (RegNo == RISCV::NoRegister) {
if (HadParens)
getLexer().UnLex(LParen);
return MatchOperand_NoMatch;
@@ -1208,6 +1191,24 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) {
Res = V;
} else
Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+ MCBinaryExpr::Opcode Opcode;
+ switch (getLexer().getKind()) {
+ default:
+ Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
+ return MatchOperand_Success;
+ case AsmToken::Plus:
+ Opcode = MCBinaryExpr::Add;
+ break;
+ case AsmToken::Minus:
+ Opcode = MCBinaryExpr::Sub;
+ break;
+ }
+
+ const MCExpr *Expr;
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_ParseFail;
+ Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext());
Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
return MatchOperand_Success;
}
@@ -1282,6 +1283,73 @@ RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) {
return MatchOperand_Success;
}
+OperandMatchResultTy RISCVAsmParser::parseAtomicMemOp(OperandVector &Operands) {
+ // Atomic operations such as lr.w, sc.w, and amo*.w accept a "memory operand"
+ // as one of their register operands, such as `(a0)`. This just denotes that
+ // the register (in this case `a0`) contains a memory address.
+ //
+ // Normally, we would be able to parse these by putting the parens into the
+ // instruction string. However, GNU as also accepts a zero-offset memory
+ // operand (such as `0(a0)`), and ignores the 0. Normally this would be parsed
+ // with parseImmediate followed by parseMemOpBaseReg, but these instructions
+ // do not accept an immediate operand, and we do not want to add a "dummy"
+ // operand that is silently dropped.
+ //
+ // Instead, we use this custom parser. This will: allow (and discard) an
+ // offset if it is zero; require (and discard) parentheses; and add only the
+ // parsed register operand to `Operands`.
+ //
+ // These operands are printed with RISCVInstPrinter::printAtomicMemOp, which
+ // will only print the register surrounded by parentheses (which GNU as also
+ // uses as its canonical representation for these operands).
+ std::unique_ptr<RISCVOperand> OptionalImmOp;
+
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Parse an Integer token. We do not accept arbitrary constant expressions
+ // in the offset field (because they may include parens, which complicates
+ // parsing a lot).
+ int64_t ImmVal;
+ SMLoc ImmStart = getLoc();
+ if (getParser().parseIntToken(ImmVal,
+ "expected '(' or optional integer offset"))
+ return MatchOperand_ParseFail;
+
+ // Create a RISCVOperand for checking later (so the error messages are
+ // nicer), but we don't add it to Operands.
+ SMLoc ImmEnd = getLoc();
+ OptionalImmOp =
+ RISCVOperand::createImm(MCConstantExpr::create(ImmVal, getContext()),
+ ImmStart, ImmEnd, isRV64());
+ }
+
+ if (getLexer().isNot(AsmToken::LParen)) {
+ Error(getLoc(), OptionalImmOp ? "expected '(' after optional integer offset"
+ : "expected '(' or optional integer offset");
+ return MatchOperand_ParseFail;
+ }
+ getParser().Lex(); // Eat '('
+
+ if (parseRegister(Operands) != MatchOperand_Success) {
+ Error(getLoc(), "expected register");
+ return MatchOperand_ParseFail;
+ }
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(getLoc(), "expected ')'");
+ return MatchOperand_ParseFail;
+ }
+ getParser().Lex(); // Eat ')'
+
+ // Deferred handling of non-zero offsets. This makes the error messages nicer.
+ if (OptionalImmOp && !OptionalImmOp->isImmZero()) {
+ Error(OptionalImmOp->getStartLoc(), "optional integer offset must be 0",
+ SMRange(OptionalImmOp->getStartLoc(), OptionalImmOp->getEndLoc()));
+ return MatchOperand_ParseFail;
+ }
+
+ return MatchOperand_Success;
+}
+
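As the comment block in parseAtomicMemOp explains, `(a0)` and `0(a0)` are both accepted spellings of the same operand, while a non-zero offset such as `4(a0)` is rejected with a dedicated diagnostic. The standalone sketch below mimics that accept/reject decision on a plain string; the real parser works on lexer tokens and RISCVOperand objects, so this helper and its error strings are purely illustrative.

#include <cctype>
#include <cstdio>
#include <string>

// Accepts "(reg)" or "0(reg)" and rejects a non-zero offset, mirroring the
// rules parseAtomicMemOp enforces. Returns true and fills Reg on success.
static bool parseAtomicMemOperand(const std::string &Op, std::string &Reg,
                                  std::string &Err) {
  size_t I = 0;
  // Optional integer offset before the '('.
  if (I < Op.size() && (std::isdigit((unsigned char)Op[I]) || Op[I] == '-')) {
    size_t Start = I;
    if (Op[I] == '-')
      ++I;
    while (I < Op.size() && std::isdigit((unsigned char)Op[I]))
      ++I;
    if (I == Start + (Op[Start] == '-' ? 1u : 0u)) {
      Err = "expected '(' or optional integer offset";
      return false;
    }
    if (std::stoll(Op.substr(Start, I - Start)) != 0) {
      Err = "optional integer offset must be 0";
      return false;
    }
  }
  if (I >= Op.size() || Op[I] != '(') {
    Err = "expected '(' or optional integer offset";
    return false;
  }
  size_t Close = Op.find(')', ++I);
  if (Close == std::string::npos || Close == I) {
    Err = "expected register and ')'";
    return false;
  }
  Reg = Op.substr(I, Close - I);
  return true;
}

int main() {
  const char *Tests[] = {"(a0)", "0(a0)", "4(a0)"};
  for (const char *T : Tests) {
    std::string Reg, Err;
    if (parseAtomicMemOperand(T, Reg, Err))
      std::printf("%-6s -> register %s\n", T, Reg.c_str());
    else
      std::printf("%-6s -> error: %s\n", T, Err.c_str());
  }
  return 0;
}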
/// Looks at a token type and creates the relevant operand from this
/// information, adding to Operands. If operand was parsed, returns false, else
/// true.
@@ -1523,12 +1591,12 @@ void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) {
S.EmitInstruction((Res ? CInst : Inst), getSTI());
}
-void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value,
+void RISCVAsmParser::emitLoadImm(Register DestReg, int64_t Value,
MCStreamer &Out) {
RISCVMatInt::InstSeq Seq;
RISCVMatInt::generateInstSeq(Value, isRV64(), Seq);
- unsigned SrcReg = RISCV::X0;
+ Register SrcReg = RISCV::X0;
for (RISCVMatInt::Inst &Inst : Seq) {
if (Inst.Opc == RISCV::LUI) {
emitToStreamer(
@@ -1682,7 +1750,7 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
default:
break;
case RISCV::PseudoLI: {
- unsigned Reg = Inst.getOperand(0).getReg();
+ Register Reg = Inst.getOperand(0).getReg();
const MCOperand &Op1 = Inst.getOperand(1);
if (Op1.isExpr()) {
// We must have li reg, %lo(sym) or li reg, %pcrel_lo(sym) or similar.
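PseudoLI above is expanded through emitLoadImm, which asks RISCVMatInt::generateInstSeq for a LUI/ADDI(W)/SLLI sequence that materializes the constant. For a plain 32-bit value the split is the usual one: round the upper 20 bits by adding 0x800, then add back a sign-extended low 12 bits. A standalone sketch of that 32-bit split follows; the real generator also folds away a zero LUI and handles wider RV64 constants with extra SLLI/ADDI steps, which are not shown.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Split a 32-bit constant into a LUI immediate (upper 20 bits) and an
// ADDI(W) immediate (signed low 12 bits) so that
//   (Hi20 << 12) + Lo12 == Val  (mod 2^32).
// Because ADDI sign-extends its immediate, the upper part is rounded by
// adding 0x800 before shifting.
static void splitImm32(int32_t Val, uint32_t &Hi20, int32_t &Lo12) {
  Lo12 = Val & 0xFFF;
  if (Lo12 >= 0x800)
    Lo12 -= 0x1000;
  Hi20 = (((uint32_t)Val + 0x800u) >> 12) & 0xFFFFFu;
}

int main() {
  for (int32_t Val : {0x12345678, -1, -2048, 0xABCD}) {
    uint32_t Hi20;
    int32_t Lo12;
    splitImm32(Val, Hi20, Lo12);
    // Reassemble the way LUI followed by ADDI(W) would.
    int32_t Rebuilt = (int32_t)((Hi20 << 12) + (uint32_t)Lo12);
    assert(Rebuilt == Val);
    std::printf("0x%08x -> lui 0x%05x ; addi %d\n", (unsigned)Val,
                (unsigned)Hi20, Lo12);
  }
  return 0;
}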
diff --git a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 36200c03f703..15943ba42156 100644
--- a/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -13,6 +13,7 @@
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "TargetInfo/RISCVTargetInfo.h"
#include "Utils/RISCVBaseInfo.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
@@ -56,17 +57,6 @@ extern "C" void LLVMInitializeRISCVDisassembler() {
createRISCVDisassembler);
}
-static const unsigned GPRDecoderTable[] = {
- RISCV::X0, RISCV::X1, RISCV::X2, RISCV::X3,
- RISCV::X4, RISCV::X5, RISCV::X6, RISCV::X7,
- RISCV::X8, RISCV::X9, RISCV::X10, RISCV::X11,
- RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15,
- RISCV::X16, RISCV::X17, RISCV::X18, RISCV::X19,
- RISCV::X20, RISCV::X21, RISCV::X22, RISCV::X23,
- RISCV::X24, RISCV::X25, RISCV::X26, RISCV::X27,
- RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31
-};
-
static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
@@ -76,38 +66,21 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint64_t RegNo,
.getFeatureBits();
bool IsRV32E = FeatureBits[RISCV::FeatureRV32E];
- if (RegNo > array_lengthof(GPRDecoderTable) || (IsRV32E && RegNo > 15))
+ if (RegNo >= 32 || (IsRV32E && RegNo >= 16))
return MCDisassembler::Fail;
- // We must define our own mapping from RegNo to register identifier.
- // Accessing index RegNo in the register class will work in the case that
- // registers were added in ascending order, but not in general.
- unsigned Reg = GPRDecoderTable[RegNo];
+ Register Reg = RISCV::X0 + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
-static const unsigned FPR32DecoderTable[] = {
- RISCV::F0_32, RISCV::F1_32, RISCV::F2_32, RISCV::F3_32,
- RISCV::F4_32, RISCV::F5_32, RISCV::F6_32, RISCV::F7_32,
- RISCV::F8_32, RISCV::F9_32, RISCV::F10_32, RISCV::F11_32,
- RISCV::F12_32, RISCV::F13_32, RISCV::F14_32, RISCV::F15_32,
- RISCV::F16_32, RISCV::F17_32, RISCV::F18_32, RISCV::F19_32,
- RISCV::F20_32, RISCV::F21_32, RISCV::F22_32, RISCV::F23_32,
- RISCV::F24_32, RISCV::F25_32, RISCV::F26_32, RISCV::F27_32,
- RISCV::F28_32, RISCV::F29_32, RISCV::F30_32, RISCV::F31_32
-};
-
static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo > array_lengthof(FPR32DecoderTable))
+ if (RegNo >= 32)
return MCDisassembler::Fail;
- // We must define our own mapping from RegNo to register identifier.
- // Accessing index RegNo in the register class will work in the case that
- // registers were added in ascending order, but not in general.
- unsigned Reg = FPR32DecoderTable[RegNo];
+ Register Reg = RISCV::F0_F + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
@@ -115,35 +88,21 @@ static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, uint64_t RegNo,
static DecodeStatus DecodeFPR32CRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo > 8) {
+ if (RegNo >= 8) {
return MCDisassembler::Fail;
}
- unsigned Reg = FPR32DecoderTable[RegNo + 8];
+ Register Reg = RISCV::F8_F + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
-static const unsigned FPR64DecoderTable[] = {
- RISCV::F0_64, RISCV::F1_64, RISCV::F2_64, RISCV::F3_64,
- RISCV::F4_64, RISCV::F5_64, RISCV::F6_64, RISCV::F7_64,
- RISCV::F8_64, RISCV::F9_64, RISCV::F10_64, RISCV::F11_64,
- RISCV::F12_64, RISCV::F13_64, RISCV::F14_64, RISCV::F15_64,
- RISCV::F16_64, RISCV::F17_64, RISCV::F18_64, RISCV::F19_64,
- RISCV::F20_64, RISCV::F21_64, RISCV::F22_64, RISCV::F23_64,
- RISCV::F24_64, RISCV::F25_64, RISCV::F26_64, RISCV::F27_64,
- RISCV::F28_64, RISCV::F29_64, RISCV::F30_64, RISCV::F31_64
-};
-
static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo > array_lengthof(FPR64DecoderTable))
+ if (RegNo >= 32)
return MCDisassembler::Fail;
- // We must define our own mapping from RegNo to register identifier.
- // Accessing index RegNo in the register class will work in the case that
- // registers were added in ascending order, but not in general.
- unsigned Reg = FPR64DecoderTable[RegNo];
+ Register Reg = RISCV::F0_D + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
@@ -151,10 +110,10 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo,
static DecodeStatus DecodeFPR64CRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo > 8) {
+ if (RegNo >= 8) {
return MCDisassembler::Fail;
}
- unsigned Reg = FPR64DecoderTable[RegNo + 8];
+ Register Reg = RISCV::F8_D + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
@@ -182,10 +141,10 @@ static DecodeStatus DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo,
static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo > 8)
+ if (RegNo >= 8)
return MCDisassembler::Fail;
- unsigned Reg = GPRDecoderTable[RegNo + 8];
+ Register Reg = RISCV::X8 + RegNo;
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
}
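The rewritten decoders above drop the per-class lookup tables and compute the register as a base plus the encoded field (RISCV::X0 + RegNo, RISCV::F0_F + RegNo, and so on), which works because the TableGen-generated enums for these classes are contiguous and ascending. A standalone sketch of the same bounds-check-then-offset pattern, with made-up base values standing in for the generated enum:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Stand-ins for the TableGen-generated register enum; the concrete values
// are made up, only the contiguous ascending numbering matters.
enum : unsigned { X0 = 1, X8 = X0 + 8 };

// Same shape as DecodeGPRRegisterClass: bounds-check the 5-bit encoding
// (rejecting x16-x31 on RV32E), then add it to the base register.
static bool decodeGPR(uint64_t RegNo, bool IsRV32E, unsigned &Reg) {
  if (RegNo >= 32 || (IsRV32E && RegNo >= 16))
    return false; // MCDisassembler::Fail in the real decoder
  Reg = X0 + (unsigned)RegNo;
  return true;
}

// Same shape as DecodeGPRCRegisterClass: compressed instructions encode only
// x8-x15 in a 3-bit field, so the base is x8.
static bool decodeGPRC(uint64_t RegNo, unsigned &Reg) {
  if (RegNo >= 8)
    return false;
  Reg = X8 + (unsigned)RegNo;
  return true;
}

int main() {
  unsigned Reg = 0;
  assert(decodeGPR(10, /*IsRV32E=*/false, Reg) && Reg == X0 + 10); // x10/a0
  assert(!decodeGPR(20, /*IsRV32E=*/true, Reg));                   // rejected
  assert(decodeGPRC(2, Reg) && Reg == X8 + 2);                     // x10 again
  std::printf("decoded register id %u\n", Reg);
  return 0;
}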
@@ -279,8 +238,80 @@ static DecodeStatus decodeFRMArg(MCInst &Inst, uint64_t Imm,
return MCDisassembler::Success;
}
+static DecodeStatus decodeRVCInstrSImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
#include "RISCVGenDisassemblerTables.inc"
+static DecodeStatus decodeRVCInstrSImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ uint64_t SImm6 =
+ fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
+ DecodeStatus Result = decodeSImmOperand<6>(Inst, SImm6, Address, Decoder);
+ (void)Result;
+ assert(Result == MCDisassembler::Success && "Invalid immediate");
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeGPRRegisterClass(Inst, 0, Address, Decoder);
+ uint64_t SImm6 =
+ fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
+ DecodeStatus Result = decodeSImmOperand<6>(Inst, SImm6, Address, Decoder);
+ (void)Result;
+ assert(Result == MCDisassembler::Success && "Invalid immediate");
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeGPRRegisterClass(Inst, 0, Address, Decoder);
+ Inst.addOperand(Inst.getOperand(0));
+ uint64_t UImm6 =
+ fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
+ DecodeStatus Result = decodeUImmOperand<6>(Inst, UImm6, Address, Decoder);
+ (void)Result;
+ assert(Result == MCDisassembler::Success && "Invalid immediate");
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(Insn, 7, 5);
+ unsigned Rs2 = fieldFromInstruction(Insn, 2, 5);
+ DecodeGPRRegisterClass(Inst, Rd, Address, Decoder);
+ DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder);
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(Insn, 7, 5);
+ unsigned Rs2 = fieldFromInstruction(Insn, 2, 5);
+ DecodeGPRRegisterClass(Inst, Rd, Address, Decoder);
+ Inst.addOperand(Inst.getOperand(0));
+ DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder);
+ return MCDisassembler::Success;
+}
+
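The decodeRVCInstr* helpers above rebuild a 6-bit immediate from two separate fields: bit 12 carries imm[5] and bits 6:2 carry imm[4:0], the C.LI/C.ADDI layout. The standalone sketch below shows that extraction plus the sign extension decodeSImmOperand<6> then applies; fieldFromInstruction here is a local stand-in for the MC helper of the same name.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Local stand-in for the MC helper: extract NumBits starting at StartBit.
static uint64_t fieldFromInstruction(uint32_t Insn, unsigned StartBit,
                                     unsigned NumBits) {
  return (Insn >> StartBit) & ((1u << NumBits) - 1);
}

// Rebuild the immediate the way decodeRVCInstrSImm does, then sign-extend
// from 6 bits as decodeSImmOperand<6> would.
static int64_t decodeRVCSImm6(uint32_t Insn) {
  uint64_t SImm6 =
      fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
  return (SImm6 & 0x20) ? (int64_t)SImm6 - 64 : (int64_t)SImm6;
}

int main() {
  // imm[5] lives in bit 12 and imm[4:0] in bits 6:2; all bits set is -1.
  uint32_t AllOnes = (1u << 12) | (0x1Fu << 2);
  assert(decodeRVCSImm6(AllOnes) == -1);
  assert(decodeRVCSImm6(0x7u << 2) == 7);
  std::printf("decoded %lld and %lld\n", (long long)decodeRVCSImm6(AllOnes),
              (long long)decodeRVCSImm6(0x7u << 2));
  return 0;
}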
DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index ee5f760ebcb0..f6b727ae37c7 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -30,9 +30,16 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCValue &Target) {
bool ShouldForce = false;
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
break;
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ if (Target.isAbsolute())
+ return false;
+ break;
case RISCV::fixup_riscv_got_hi20:
case RISCV::fixup_riscv_tls_got_hi20:
case RISCV::fixup_riscv_tls_gd_hi20:
@@ -48,7 +55,7 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
return false;
}
- switch ((unsigned)T->getKind()) {
+ switch (T->getTargetKind()) {
default:
llvm_unreachable("Unexpected fixup kind for pcrel_lo12");
break;
@@ -83,7 +90,7 @@ bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
return true;
int64_t Offset = int64_t(Value);
- switch ((unsigned)Fixup.getKind()) {
+ switch (Fixup.getTargetKind()) {
default:
return false;
case RISCV::fixup_riscv_rvc_branch:
@@ -174,8 +181,7 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
MCContext &Ctx) {
- unsigned Kind = Fixup.getKind();
- switch (Kind) {
+ switch (Fixup.getTargetKind()) {
default:
llvm_unreachable("Unknown fixup kind!");
case RISCV::fixup_riscv_got_hi20:
@@ -186,6 +192,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case FK_Data_2:
case FK_Data_4:
case FK_Data_8:
+ case FK_Data_6b:
return Value;
case RISCV::fixup_riscv_lo12_i:
case RISCV::fixup_riscv_pcrel_lo12_i:
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index 3ccbc86d2619..cab2bbcb81bc 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/RISCVFixupKinds.h"
+#include "MCTargetDesc/RISCVMCExpr.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixup.h"
@@ -47,8 +48,9 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
+ const MCExpr *Expr = Fixup.getValue();
// Determine the type of the relocation
- unsigned Kind = Fixup.getKind();
+ unsigned Kind = Fixup.getTargetKind();
if (IsPCRel) {
switch (Kind) {
default:
@@ -87,6 +89,9 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
default:
llvm_unreachable("invalid fixup kind!");
case FK_Data_4:
+ if (Expr->getKind() == MCExpr::Target &&
+ cast<RISCVMCExpr>(Expr)->getKind() == RISCVMCExpr::VK_RISCV_32_PCREL)
+ return ELF::R_RISCV_32_PCREL;
return ELF::R_RISCV_32;
case FK_Data_8:
return ELF::R_RISCV_64;
@@ -98,6 +103,8 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_RISCV_ADD32;
case FK_Data_Add_8:
return ELF::R_RISCV_ADD64;
+ case FK_Data_Add_6b:
+ return ELF::R_RISCV_SET6;
case FK_Data_Sub_1:
return ELF::R_RISCV_SUB8;
case FK_Data_Sub_2:
@@ -106,6 +113,8 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_RISCV_SUB32;
case FK_Data_Sub_8:
return ELF::R_RISCV_SUB64;
+ case FK_Data_Sub_6b:
+ return ELF::R_RISCV_SUB6;
case RISCV::fixup_riscv_hi20:
return ELF::R_RISCV_HI20;
case RISCV::fixup_riscv_lo12_i:
@@ -129,5 +138,5 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createRISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit) {
- return llvm::make_unique<RISCVELFObjectWriter>(OSABI, Is64Bit);
+ return std::make_unique<RISCVELFObjectWriter>(OSABI, Is64Bit);
}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index fe37b70811d8..8b5fe6dd8252 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -39,6 +39,30 @@ static cl::opt<bool>
cl::desc("Disable the emission of assembler pseudo instructions"),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ ArchRegNames("riscv-arch-reg-names",
+ cl::desc("Print architectural register names rather than the "
+ "ABI names (such as x2 instead of sp)"),
+ cl::init(false), cl::Hidden);
+
+// The command-line flags above are used by llvm-mc and llc. They can be used by
+// `llvm-objdump`, but we override their values here to handle options passed to
+// `llvm-objdump` with `-M` (which matches GNU objdump). There did not seem to
+// be an easier way to allow these options in all these tools, without doing it
+// this way.
+bool RISCVInstPrinter::applyTargetSpecificCLOption(StringRef Opt) {
+ if (Opt == "no-aliases") {
+ NoAliases = true;
+ return true;
+ }
+ if (Opt == "numeric") {
+ ArchRegNames = true;
+ return true;
+ }
+
+ return false;
+}
+
void RISCVInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
StringRef Annot, const MCSubtargetInfo &STI) {
bool Res = false;
@@ -112,3 +136,20 @@ void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo,
static_cast<RISCVFPRndMode::RoundingMode>(MI->getOperand(OpNo).getImm());
O << RISCVFPRndMode::roundingModeToString(FRMArg);
}
+
+void RISCVInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNo);
+
+ assert(MO.isReg() && "printAtomicMemOp can only print register operands");
+ O << "(";
+ printRegName(O, MO.getReg());
+ O << ")";
+ return;
+}
+
+const char *RISCVInstPrinter::getRegisterName(unsigned RegNo) {
+ return getRegisterName(RegNo, ArchRegNames ? RISCV::NoRegAltName
+ : RISCV::ABIRegAltName);
+}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
index 5ca1d3fa20fe..189d72626f3e 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
@@ -25,6 +25,8 @@ public:
const MCRegisterInfo &MRI)
: MCInstPrinter(MAI, MII, MRI) {}
+ bool applyTargetSpecificCLOption(StringRef Opt) override;
+
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
const MCSubtargetInfo &STI) override;
void printRegName(raw_ostream &O, unsigned RegNo) const override;
@@ -37,6 +39,8 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printFRMArg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printAtomicMemOp(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
@@ -46,8 +50,8 @@ public:
void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
unsigned PrintMethodIdx,
const MCSubtargetInfo &STI, raw_ostream &O);
- static const char *getRegisterName(unsigned RegNo,
- unsigned AltIdx = RISCV::ABIRegAltName);
+ static const char *getRegisterName(unsigned RegNo);
+ static const char *getRegisterName(unsigned RegNo, unsigned AltIdx);
};
} // namespace llvm
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
index 983629692883..089a2def4c21 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -11,7 +11,10 @@
//===----------------------------------------------------------------------===//
#include "RISCVMCAsmInfo.h"
+#include "MCTargetDesc/RISCVMCExpr.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/MC/MCStreamer.h"
using namespace llvm;
void RISCVMCAsmInfo::anchor() {}
@@ -25,3 +28,20 @@ RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) {
Data16bitsDirective = "\t.half\t";
Data32bitsDirective = "\t.word\t";
}
+
+const MCExpr *RISCVMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const {
+ if (!(Encoding & dwarf::DW_EH_PE_pcrel))
+ return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
+
+ // The default symbol subtraction results in an ADD/SUB relocation pair.
+ // Processing this relocation pair is problematic when linker relaxation is
+ // enabled, so we follow binutils in using the R_RISCV_32_PCREL relocation
+ // for the FDE initial location.
+ MCContext &Ctx = Streamer.getContext();
+ const MCExpr *ME =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+ assert(Encoding & dwarf::DW_EH_PE_sdata4 && "Unexpected encoding");
+ return RISCVMCExpr::create(ME, RISCVMCExpr::VK_RISCV_32_PCREL, Ctx);
+}
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
index 043fdb7c08c0..6824baf699aa 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
@@ -23,6 +23,9 @@ class RISCVMCAsmInfo : public MCAsmInfoELF {
public:
explicit RISCVMCAsmInfo(const Triple &TargetTriple);
+
+ const MCExpr *getExprForFDESymbol(const MCSymbol *Sym, unsigned Encoding,
+ MCStreamer &Streamer) const override;
};
} // namespace llvm
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 0fc775f63ed4..de99960848a5 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "Utils/RISCVBaseInfo.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -100,7 +101,7 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
const MCSubtargetInfo &STI) const {
MCInst TmpInst;
MCOperand Func;
- unsigned Ra;
+ Register Ra;
if (MI.getOpcode() == RISCV::PseudoTAIL) {
Func = MI.getOperand(0);
Ra = RISCV::X6;
@@ -266,6 +267,7 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
switch (RVExpr->getKind()) {
case RISCVMCExpr::VK_RISCV_None:
case RISCVMCExpr::VK_RISCV_Invalid:
+ case RISCVMCExpr::VK_RISCV_32_PCREL:
llvm_unreachable("Unhandled fixup kind!");
case RISCVMCExpr::VK_RISCV_TPREL_ADD:
// tprel_add is only used to indicate that a relocation should be emitted
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index b5a292dc1b1a..921df376f3df 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -36,6 +36,7 @@ public:
VK_RISCV_TLS_GD_HI,
VK_RISCV_CALL,
VK_RISCV_CALL_PLT,
+ VK_RISCV_32_PCREL,
VK_RISCV_Invalid
};
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index bc45262ab2de..5a4c86e48f1e 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -16,7 +16,9 @@
#include "RISCVMCAsmInfo.h"
#include "RISCVTargetStreamer.h"
#include "TargetInfo/RISCVTargetInfo.h"
+#include "Utils/RISCVBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -52,7 +54,7 @@ static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TT) {
MCAsmInfo *MAI = new RISCVMCAsmInfo(TT);
- unsigned SP = MRI.getDwarfRegNum(RISCV::X2, true);
+ Register SP = MRI.getDwarfRegNum(RISCV::X2, true);
MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0);
MAI->addInitialFrameState(Inst);
diff --git a/lib/Target/RISCV/RISCV.h b/lib/Target/RISCV/RISCV.h
index 834a1d171143..f23f742a4782 100644
--- a/lib/Target/RISCV/RISCV.h
+++ b/lib/Target/RISCV/RISCV.h
@@ -18,9 +18,12 @@
#include "llvm/Target/TargetMachine.h"
namespace llvm {
+class RISCVRegisterBankInfo;
+class RISCVSubtarget;
class RISCVTargetMachine;
class AsmPrinter;
class FunctionPass;
+class InstructionSelector;
class MCInst;
class MCOperand;
class MachineInstr;
@@ -39,6 +42,10 @@ void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
FunctionPass *createRISCVExpandPseudoPass();
void initializeRISCVExpandPseudoPass(PassRegistry &);
+
+InstructionSelector *createRISCVInstructionSelector(const RISCVTargetMachine &,
+ RISCVSubtarget &,
+ RISCVRegisterBankInfo &);
}
#endif
diff --git a/lib/Target/RISCV/RISCV.td b/lib/Target/RISCV/RISCV.td
index e19b70b8e709..46530a8f74a8 100644
--- a/lib/Target/RISCV/RISCV.td
+++ b/lib/Target/RISCV/RISCV.td
@@ -43,6 +43,11 @@ def FeatureStdExtC
def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">,
AssemblerPredicate<"FeatureStdExtC">;
+def FeatureRVCHints
+ : SubtargetFeature<"rvc-hints", "EnableRVCHintInstrs", "true",
+ "Enable RVC Hint Instructions.">;
+def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">,
+ AssemblerPredicate<"FeatureRVCHints">;
def Feature64Bit
: SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">;
@@ -77,14 +82,16 @@ include "RISCVSystemOperands.td"
include "RISCVRegisterInfo.td"
include "RISCVCallingConv.td"
include "RISCVInstrInfo.td"
+include "RISCVRegisterBanks.td"
//===----------------------------------------------------------------------===//
// RISC-V processors supported.
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"generic-rv32", NoSchedModel, []>;
+def : ProcessorModel<"generic-rv32", NoSchedModel, [FeatureRVCHints]>;
-def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>;
+def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit,
+ FeatureRVCHints]>;
//===----------------------------------------------------------------------===//
// Define the RISC-V target.
diff --git a/lib/Target/RISCV/RISCVCallLowering.cpp b/lib/Target/RISCV/RISCVCallLowering.cpp
new file mode 100644
index 000000000000..c63a84739c4a
--- /dev/null
+++ b/lib/Target/RISCV/RISCVCallLowering.cpp
@@ -0,0 +1,50 @@
+//===-- RISCVCallLowering.cpp - Call lowering -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVCallLowering.h"
+#include "RISCVISelLowering.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+
+using namespace llvm;
+
+RISCVCallLowering::RISCVCallLowering(const RISCVTargetLowering &TLI)
+ : CallLowering(&TLI) {}
+
+bool RISCVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val,
+ ArrayRef<Register> VRegs) const {
+
+ MachineInstrBuilder Ret = MIRBuilder.buildInstrNoInsert(RISCV::PseudoRET);
+
+ if (Val != nullptr) {
+ return false;
+ }
+ MIRBuilder.insertInstr(Ret);
+ return true;
+}
+
+bool RISCVCallLowering::lowerFormalArguments(
+ MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const {
+
+ if (F.arg_empty())
+ return true;
+
+ return false;
+}
+
+bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const {
+ return false;
+}
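
Note: the call-lowering hooks above are deliberately minimal. lowerReturn only
succeeds for a bare void return, lowerFormalArguments only for functions with
no arguments, and lowerCall always reports failure, so anything else falls back
from GlobalISel to the existing SelectionDAG path. As a rough illustration (the
example is mine, not part of the patch), IR as simple as
`define void @f() { ret void }` is what this stub can translate end-to-end; any
argument or non-void return value triggers the fallback.
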
diff --git a/lib/Target/RISCV/RISCVCallLowering.h b/lib/Target/RISCV/RISCVCallLowering.h
new file mode 100644
index 000000000000..7ce074a61f0a
--- /dev/null
+++ b/lib/Target/RISCV/RISCVCallLowering.h
@@ -0,0 +1,42 @@
+//===-- RISCVCallLowering.h - Call lowering ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVCALLLOWERING_H
+#define LLVM_LIB_TARGET_RISCV_RISCVCALLLOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+class RISCVTargetLowering;
+
+class RISCVCallLowering : public CallLowering {
+
+public:
+ RISCVCallLowering(const RISCVTargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
+ ArrayRef<Register> VRegs) const override;
+
+ bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs) const override;
+
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_RISCV_RISCVCALLLOWERING_H
diff --git a/lib/Target/RISCV/RISCVCallingConv.td b/lib/Target/RISCV/RISCVCallingConv.td
index db13e6e8beca..025454f8fcca 100644
--- a/lib/Target/RISCV/RISCVCallingConv.td
+++ b/lib/Target/RISCV/RISCVCallingConv.td
@@ -18,11 +18,11 @@ def CSR_ILP32_LP64
def CSR_ILP32F_LP64F
: CalleeSavedRegs<(add CSR_ILP32_LP64,
- F8_32, F9_32, (sequence "F%u_32", 18, 27))>;
+ F8_F, F9_F, (sequence "F%u_F", 18, 27))>;
def CSR_ILP32D_LP64D
: CalleeSavedRegs<(add CSR_ILP32_LP64,
- F8_64, F9_64, (sequence "F%u_64", 18, 27))>;
+ F8_D, F9_D, (sequence "F%u_D", 18, 27))>;
// Needed for implementation of RISCVRegisterInfo::getNoPreservedMask()
def CSR_NoRegs : CalleeSavedRegs<(add)>;
@@ -43,12 +43,12 @@ def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add X1,
(sequence "X%u", 12, 17),
(sequence "X%u", 18, 27),
(sequence "X%u", 28, 31),
- (sequence "F%u_32", 0, 7),
- (sequence "F%u_32", 10, 11),
- (sequence "F%u_32", 12, 17),
- (sequence "F%u_32", 28, 31),
- (sequence "F%u_32", 8, 9),
- (sequence "F%u_32", 18, 27))>;
+ (sequence "F%u_F", 0, 7),
+ (sequence "F%u_F", 10, 11),
+ (sequence "F%u_F", 12, 17),
+ (sequence "F%u_F", 28, 31),
+ (sequence "F%u_F", 8, 9),
+ (sequence "F%u_F", 18, 27))>;
// Same as CSR_Interrupt, but including all 64-bit FP registers.
def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add X1,
@@ -57,9 +57,9 @@ def CSR_XLEN_F64_Interrupt: CalleeSavedRegs<(add X1,
(sequence "X%u", 12, 17),
(sequence "X%u", 18, 27),
(sequence "X%u", 28, 31),
- (sequence "F%u_64", 0, 7),
- (sequence "F%u_64", 10, 11),
- (sequence "F%u_64", 12, 17),
- (sequence "F%u_64", 28, 31),
- (sequence "F%u_64", 8, 9),
- (sequence "F%u_64", 18, 27))>;
+ (sequence "F%u_D", 0, 7),
+ (sequence "F%u_D", 10, 11),
+ (sequence "F%u_D", 12, 17),
+ (sequence "F%u_D", 28, 31),
+ (sequence "F%u_D", 8, 9),
+ (sequence "F%u_D", 18, 27))>;
diff --git a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 1c5171a7b7a4..da5cd16e750c 100644
--- a/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -235,10 +235,10 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
MachineBasicBlock *LoopMBB,
MachineBasicBlock *DoneMBB,
AtomicRMWInst::BinOp BinOp, int Width) {
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned ScratchReg = MI.getOperand(1).getReg();
- unsigned AddrReg = MI.getOperand(2).getReg();
- unsigned IncrReg = MI.getOperand(3).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register ScratchReg = MI.getOperand(1).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register IncrReg = MI.getOperand(3).getReg();
AtomicOrdering Ordering =
static_cast<AtomicOrdering>(MI.getOperand(4).getImm());
@@ -271,9 +271,9 @@ static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
}
static void insertMaskedMerge(const RISCVInstrInfo *TII, DebugLoc DL,
- MachineBasicBlock *MBB, unsigned DestReg,
- unsigned OldValReg, unsigned NewValReg,
- unsigned MaskReg, unsigned ScratchReg) {
+ MachineBasicBlock *MBB, Register DestReg,
+ Register OldValReg, Register NewValReg,
+ Register MaskReg, Register ScratchReg) {
assert(OldValReg != ScratchReg && "OldValReg and ScratchReg must be unique");
assert(OldValReg != MaskReg && "OldValReg and MaskReg must be unique");
assert(ScratchReg != MaskReg && "ScratchReg and MaskReg must be unique");
@@ -297,11 +297,11 @@ static void doMaskedAtomicBinOpExpansion(
MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopMBB,
MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) {
assert(Width == 32 && "Should never need to expand masked 64-bit operations");
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned ScratchReg = MI.getOperand(1).getReg();
- unsigned AddrReg = MI.getOperand(2).getReg();
- unsigned IncrReg = MI.getOperand(3).getReg();
- unsigned MaskReg = MI.getOperand(4).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register ScratchReg = MI.getOperand(1).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register IncrReg = MI.getOperand(3).getReg();
+ Register MaskReg = MI.getOperand(4).getReg();
AtomicOrdering Ordering =
static_cast<AtomicOrdering>(MI.getOperand(5).getImm());
@@ -394,8 +394,8 @@ bool RISCVExpandPseudo::expandAtomicBinOp(
}
static void insertSext(const RISCVInstrInfo *TII, DebugLoc DL,
- MachineBasicBlock *MBB, unsigned ValReg,
- unsigned ShamtReg) {
+ MachineBasicBlock *MBB, Register ValReg,
+ Register ShamtReg) {
BuildMI(MBB, DL, TII->get(RISCV::SLL), ValReg)
.addReg(ValReg)
.addReg(ShamtReg);
@@ -436,12 +436,12 @@ bool RISCVExpandPseudo::expandAtomicMinMaxOp(
DoneMBB->transferSuccessors(&MBB);
MBB.addSuccessor(LoopHeadMBB);
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned Scratch1Reg = MI.getOperand(1).getReg();
- unsigned Scratch2Reg = MI.getOperand(2).getReg();
- unsigned AddrReg = MI.getOperand(3).getReg();
- unsigned IncrReg = MI.getOperand(4).getReg();
- unsigned MaskReg = MI.getOperand(5).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register Scratch1Reg = MI.getOperand(1).getReg();
+ Register Scratch2Reg = MI.getOperand(2).getReg();
+ Register AddrReg = MI.getOperand(3).getReg();
+ Register IncrReg = MI.getOperand(4).getReg();
+ Register MaskReg = MI.getOperand(5).getReg();
bool IsSigned = BinOp == AtomicRMWInst::Min || BinOp == AtomicRMWInst::Max;
AtomicOrdering Ordering =
static_cast<AtomicOrdering>(MI.getOperand(IsSigned ? 7 : 6).getImm());
@@ -549,11 +549,11 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg(
DoneMBB->transferSuccessors(&MBB);
MBB.addSuccessor(LoopHeadMBB);
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned ScratchReg = MI.getOperand(1).getReg();
- unsigned AddrReg = MI.getOperand(2).getReg();
- unsigned CmpValReg = MI.getOperand(3).getReg();
- unsigned NewValReg = MI.getOperand(4).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register ScratchReg = MI.getOperand(1).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register CmpValReg = MI.getOperand(3).getReg();
+ Register NewValReg = MI.getOperand(4).getReg();
AtomicOrdering Ordering =
static_cast<AtomicOrdering>(MI.getOperand(IsMasked ? 6 : 5).getImm());
@@ -582,7 +582,7 @@ bool RISCVExpandPseudo::expandAtomicCmpXchg(
// lr.w dest, (addr)
// and scratch, dest, mask
// bne scratch, cmpval, done
- unsigned MaskReg = MI.getOperand(5).getReg();
+ Register MaskReg = MI.getOperand(5).getReg();
BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
.addReg(AddrReg);
BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), ScratchReg)
@@ -629,7 +629,7 @@ bool RISCVExpandPseudo::expandAuipcInstPair(
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
const MachineOperand &Symbol = MI.getOperand(1);
MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
diff --git a/lib/Target/RISCV/RISCVFrameLowering.cpp b/lib/Target/RISCV/RISCVFrameLowering.cpp
index 32c3b9684d2c..6b6f62e18ce9 100644
--- a/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -40,8 +40,16 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
uint64_t FrameSize = MFI.getStackSize();
// Get the alignment.
- uint64_t StackAlign = RI->needsStackRealignment(MF) ? MFI.getMaxAlignment()
- : getStackAlignment();
+ unsigned StackAlign = getStackAlignment();
+ if (RI->needsStackRealignment(MF)) {
+ unsigned MaxStackAlign = std::max(StackAlign, MFI.getMaxAlignment());
+ FrameSize += (MaxStackAlign - StackAlign);
+ StackAlign = MaxStackAlign;
+ }
+
+ // Set Max Call Frame Size
+ uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign);
+ MFI.setMaxCallFrameSize(MaxCallSize);
// Make sure the frame is aligned.
FrameSize = alignTo(FrameSize, StackAlign);
@@ -52,8 +60,8 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, unsigned DestReg,
- unsigned SrcReg, int64_t Val,
+ const DebugLoc &DL, Register DestReg,
+ Register SrcReg, int64_t Val,
MachineInstr::MIFlag Flag) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const RISCVInstrInfo *TII = STI.getInstrInfo();
@@ -66,7 +74,7 @@ void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB,
.addReg(SrcReg)
.addImm(Val)
.setMIFlag(Flag);
- } else if (isInt<32>(Val)) {
+ } else {
unsigned Opc = RISCV::ADD;
bool isSub = Val < 0;
if (isSub) {
@@ -74,22 +82,20 @@ void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB,
Opc = RISCV::SUB;
}
- unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
- TII->movImm32(MBB, MBBI, DL, ScratchReg, Val, Flag);
+ Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ TII->movImm(MBB, MBBI, DL, ScratchReg, Val, Flag);
BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
.addReg(SrcReg)
.addReg(ScratchReg, RegState::Kill)
.setMIFlag(Flag);
- } else {
- report_fatal_error("adjustReg cannot yet handle adjustments >32 bits");
}
}
// Returns the register used to hold the frame pointer.
-static unsigned getFPReg(const RISCVSubtarget &STI) { return RISCV::X8; }
+static Register getFPReg(const RISCVSubtarget &STI) { return RISCV::X8; }
// Returns the register used to hold the stack pointer.
-static unsigned getSPReg(const RISCVSubtarget &STI) { return RISCV::X2; }
+static Register getSPReg(const RISCVSubtarget &STI) { return RISCV::X2; }
void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
@@ -101,8 +107,14 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
const RISCVInstrInfo *TII = STI.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.begin();
- unsigned FPReg = getFPReg(STI);
- unsigned SPReg = getSPReg(STI);
+ if (RI->needsStackRealignment(MF) && MFI.hasVarSizedObjects()) {
+ report_fatal_error(
+ "RISC-V backend can't currently handle functions that need stack "
+ "realignment and have variable sized objects");
+ }
+
+ Register FPReg = getFPReg(STI);
+ Register SPReg = getSPReg(STI);
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
@@ -119,6 +131,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
if (StackSize == 0 && !MFI.adjustsStack())
return;
+ uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+  // Split the SP adjustment to reduce the offsets of the callee saved spills.
+ if (FirstSPAdjustAmount)
+ StackSize = FirstSPAdjustAmount;
+
// Allocate space on the stack if necessary.
adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
@@ -141,7 +158,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// directives.
for (const auto &Entry : CSI) {
int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx());
- unsigned Reg = Entry.getReg();
+ Register Reg = Entry.getReg();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, RI->getDwarfRegNum(Reg, true), Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
@@ -159,6 +176,45 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
+
+ // Emit the second SP adjustment after saving callee saved registers.
+ if (FirstSPAdjustAmount) {
+ uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount;
+ assert(SecondSPAdjustAmount > 0 &&
+ "SecondSPAdjustAmount should be greater than zero");
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount,
+ MachineInstr::FrameSetup);
+ // Emit ".cfi_def_cfa_offset StackSize"
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+
+ if (hasFP(MF)) {
+ // Realign Stack
+ const RISCVRegisterInfo *RI = STI.getRegisterInfo();
+ if (RI->needsStackRealignment(MF)) {
+ unsigned MaxAlignment = MFI.getMaxAlignment();
+
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
+ if (isInt<12>(-(int)MaxAlignment)) {
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::ANDI), SPReg)
+ .addReg(SPReg)
+ .addImm(-(int)MaxAlignment);
+ } else {
+ unsigned ShiftAmount = countTrailingZeros(MaxAlignment);
+ Register VR =
+ MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::SRLI), VR)
+ .addReg(SPReg)
+ .addImm(ShiftAmount);
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::SLLI), SPReg)
+ .addReg(VR)
+ .addImm(ShiftAmount);
+ }
+ }
+ }
}
void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -169,8 +225,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
DebugLoc DL = MBBI->getDebugLoc();
const RISCVInstrInfo *TII = STI.getInstrInfo();
- unsigned FPReg = getFPReg(STI);
- unsigned SPReg = getSPReg(STI);
+ Register FPReg = getFPReg(STI);
+ Register SPReg = getSPReg(STI);
// Skip to before the restores of callee-saved registers
// FIXME: assumes exactly one instruction is used to restore each
@@ -189,11 +245,29 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
MachineInstr::FrameDestroy);
}
+ uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+ if (FirstSPAdjustAmount) {
+ uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount;
+ assert(SecondSPAdjustAmount > 0 &&
+ "SecondSPAdjustAmount should be greater than zero");
+
+ adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, SecondSPAdjustAmount,
+ MachineInstr::FrameDestroy);
+
+ // Emit ".cfi_def_cfa_offset FirstSPAdjustAmount"
+ unsigned CFIIndex =
+ MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr,
+ -FirstSPAdjustAmount));
+ BuildMI(MBB, LastFrameDestroy, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+
if (hasFP(MF)) {
// To find the instruction restoring FP from stack.
for (auto &I = LastFrameDestroy; I != MBBI; ++I) {
if (I->mayLoad() && I->getOperand(0).isReg()) {
- unsigned DestReg = I->getOperand(0).getReg();
+ Register DestReg = I->getOperand(0).getReg();
if (DestReg == FPReg) {
// If there is frame pointer, after restoring $fp registers, we
// need adjust CFA to ($sp - FPOffset).
@@ -214,13 +288,16 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
// Iterate over list of callee-saved registers and emit .cfi_restore
// directives.
for (const auto &Entry : CSI) {
- unsigned Reg = Entry.getReg();
+ Register Reg = Entry.getReg();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
nullptr, RI->getDwarfRegNum(Reg, true)));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
+ if (FirstSPAdjustAmount)
+ StackSize = FirstSPAdjustAmount;
+
// Deallocate stack
adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
@@ -249,6 +326,8 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea() +
MFI.getOffsetAdjustment();
+ uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+
if (CSI.size()) {
MinCSFI = CSI[0].getFrameIdx();
MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
@@ -256,6 +335,17 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
if (FI >= MinCSFI && FI <= MaxCSFI) {
FrameReg = RISCV::X2;
+
+ if (FirstSPAdjustAmount)
+ Offset += FirstSPAdjustAmount;
+ else
+ Offset += MF.getFrameInfo().getStackSize();
+ } else if (RI->needsStackRealignment(MF)) {
+ assert(!MFI.hasVarSizedObjects() &&
+ "Unexpected combination of stack realignment and varsized objects");
+ // If the stack was realigned, the frame pointer is set in order to allow
+ // SP to be restored, but we still access stack objects using SP.
+ FrameReg = RISCV::X2;
Offset += MF.getFrameInfo().getStackSize();
} else {
FrameReg = RI->getFrameRegister(MF);
@@ -338,7 +428,7 @@ bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
- unsigned SPReg = RISCV::X2;
+ Register SPReg = RISCV::X2;
DebugLoc DL = MI->getDebugLoc();
if (!hasReservedCallFrame(MF)) {
@@ -362,3 +452,39 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
return MBB.erase(MI);
}
+
+// We would like to split the SP adjustment so that the prologue/epilogue look
+// like the following instructions. In this way, the offset of each callee
+// saved register spill fits in a single store.
+// add sp,sp,-2032
+// sw ra,2028(sp)
+// sw s0,2024(sp)
+// sw s1,2020(sp)
+// sw s3,2012(sp)
+// sw s4,2008(sp)
+// add sp,sp,-64
+uint64_t
+RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ uint64_t StackSize = MFI.getStackSize();
+ uint64_t StackAlign = getStackAlignment();
+
+  // FIXME: Disable SplitSPAdjust if the save-restore libcall is enabled once
+  //        that patch lands. The callee saved registers will be pushed by the
+  //        save-restore libcalls, so we will not have to split the SP
+  //        adjustment in that case.
+ //
+  // Return FirstSPAdjustAmount if the StackSize cannot fit in a signed 12-bit
+  // immediate and there is a callee saved register that needs to be pushed.
+ if (!isInt<12>(StackSize) && (CSI.size() > 0)) {
+    // FirstSPAdjustAmount is chosen as (2048 - StackAlign) because 2048 would
+    // cause the `sp = sp + 2048` in the epilogue to be split into multiple
+    // instructions. Offsets smaller than 2048 fit in a single load/store
+    // instruction, and we have to stick to the stack alignment. 2048 is
+    // 16-byte aligned, but the stack alignment for RV32/RV64 is 16 and for
+    // RV32E it is 4, so (2048 - StackAlign) still satisfies the alignment.
+ return 2048 - StackAlign;
+ }
+ return 0;
+}
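
Note: a short worked example of the split above, assuming the default
StackAlign of 16 for RV32/RV64 (the concrete numbers are illustrative). For a
frame with StackSize = 2064 bytes and at least one callee saved register,
isInt<12>(2064) is false, so getFirstSPAdjustAmount returns 2048 - 16 = 2032.
The prologue then emits `addi sp, sp, -2032`, spills the callee saved registers
at offsets below 2048 (each fitting a single sw/sd), and performs a second
adjustment of 2064 - 2032 = 32. The epilogue mirrors this: +32 before the
callee saved restores and +2032 at the end. Both amounts fit in a 12-bit
immediate, so neither adjustment needs the scratch-register movImm path in
adjustReg.
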
diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h
index 0e045c3ff853..f4a5949773d9 100644
--- a/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/lib/Target/RISCV/RISCVFrameLowering.h
@@ -22,7 +22,7 @@ class RISCVFrameLowering : public TargetFrameLowering {
public:
explicit RISCVFrameLowering(const RISCVSubtarget &STI)
: TargetFrameLowering(StackGrowsDown,
- /*StackAlignment=*/16,
+ /*StackAlignment=*/Align(16),
/*LocalAreaOffset=*/0),
STI(STI) {}
@@ -45,13 +45,18 @@ public:
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
+ // Get the first stack adjustment amount for SplitSPAdjust.
+  // Return 0 if we don't want to split the SP adjustment in prologue and
+ // epilogue.
+ uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const;
+
protected:
const RISCVSubtarget &STI;
private:
void determineFrameLayout(MachineFunction &MF) const;
void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, Register DestReg, Register SrcReg,
int64_t Val, MachineInstr::MIFlag Flag) const;
};
}
diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index d0a3af375a6d..1a12d9177d2a 100644
--- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -68,7 +68,7 @@ static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
RISCVMatInt::InstSeq Seq;
RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq);
- SDNode *Result;
+ SDNode *Result = nullptr;
SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT);
for (RISCVMatInt::Inst &Inst : Seq) {
SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT);
@@ -179,6 +179,9 @@ bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand(
// operand and need no special handling.
OutOps.push_back(Op);
return false;
+ case InlineAsm::Constraint_A:
+ OutOps.push_back(Op);
+ return false;
default:
break;
}
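
Note: InlineAsm::Constraint_A above corresponds to the RISC-V 'A' inline
assembly constraint (an address held in a general-purpose register); the
matching getInlineAsmMemConstraint hook is added in RISCVISelLowering.cpp
below. A hedged source-level sketch of how it is typically used -- only the
"A" constraint itself comes from this patch; the surrounding code and the
assumption of the A (atomics) extension are mine:

  // Atomically swap *p and v; the "+A"(*p) operand makes the compiler
  // materialize the address in a single GPR for amoswap.w.
  static inline int amo_swap_w(int *p, int v) {
    int old;
    __asm__ __volatile__("amoswap.w %0, %2, %1"
                         : "=r"(old), "+A"(*p)
                         : "r"(v)
                         : "memory");
    return old;
  }
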
diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp
index ce7b85911ab6..dc829fce9013 100644
--- a/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -100,6 +100,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::ADD, MVT::i32, Custom);
+ setOperationAction(ISD::SUB, MVT::i32, Custom);
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
@@ -116,6 +118,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) {
+ setOperationAction(ISD::MUL, MVT::i32, Custom);
setOperationAction(ISD::SDIV, MVT::i32, Custom);
setOperationAction(ISD::UDIV, MVT::i32, Custom);
setOperationAction(ISD::UREM, MVT::i32, Custom);
@@ -194,8 +197,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setBooleanContents(ZeroOrOneBooleanContent);
- // Function alignments (log2).
- unsigned FunctionAlignment = Subtarget.hasStdExtC() ? 1 : 2;
+ // Function alignments.
+ const Align FunctionAlignment(Subtarget.hasStdExtC() ? 2 : 4);
setMinFunctionAlignment(FunctionAlignment);
setPrefFunctionAlignment(FunctionAlignment);
@@ -231,7 +234,7 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 4;
+ Info.align = Align(4);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile;
return true;
@@ -660,7 +663,7 @@ SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setFrameAddressIsTaken(true);
- unsigned FrameReg = RI.getFrameRegister(MF);
+ Register FrameReg = RI.getFrameRegister(MF);
int XLenInBytes = Subtarget.getXLen() / 8;
EVT VT = Op.getValueType();
@@ -703,7 +706,7 @@ SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op,
// Return the value of the return address register, marking it an implicit
// live-in.
- unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT));
+ Register Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT));
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT);
}
@@ -834,6 +837,18 @@ static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
}
+// Converts the given 32-bit operation to an i64 operation with sign-extension
+// semantics, in order to reduce the number of sign-extension instructions.
+static SDValue customLegalizeToWOpWithSExt(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+ SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewWOp = DAG.getNode(N->getOpcode(), DL, MVT::i64, NewOp0, NewOp1);
+ SDValue NewRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, NewWOp,
+ DAG.getValueType(MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
+}
+
void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -854,6 +869,15 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(RCW.getValue(2));
break;
}
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ if (N->getOperand(1).getOpcode() == ISD::Constant)
+ return;
+ Results.push_back(customLegalizeToWOpWithSExt(N, DAG));
+ break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -1007,12 +1031,14 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift(
// We can materialise `c1 << c2` into an add immediate, so it's "free",
// and the combine should happen, to potentially allow further combines
// later.
- if (isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
+ if (ShiftedC1Int.getMinSignedBits() <= 64 &&
+ isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
return true;
// We can materialise `c1` in an add immediate, so it's "free", and the
// combine should be prevented.
- if (isLegalAddImmediate(C1Int.getSExtValue()))
+ if (C1Int.getMinSignedBits() <= 64 &&
+ isLegalAddImmediate(C1Int.getSExtValue()))
return false;
// Neither constant will fit into an immediate, so find materialisation
@@ -1052,8 +1078,8 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
return 1;
}
-MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI,
- MachineBasicBlock *BB) {
+static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI,
+ MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction");
// To read the 64-bit cycle CSR on a 32-bit target, we read the two halves.
@@ -1085,9 +1111,9 @@ MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI,
BB->addSuccessor(LoopMBB);
MachineRegisterInfo &RegInfo = MF.getRegInfo();
- unsigned ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
- unsigned LoReg = MI.getOperand(0).getReg();
- unsigned HiReg = MI.getOperand(1).getReg();
+ Register ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
+ Register LoReg = MI.getOperand(0).getReg();
+ Register HiReg = MI.getOperand(1).getReg();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
@@ -1122,9 +1148,9 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
- unsigned LoReg = MI.getOperand(0).getReg();
- unsigned HiReg = MI.getOperand(1).getReg();
- unsigned SrcReg = MI.getOperand(2).getReg();
+ Register LoReg = MI.getOperand(0).getReg();
+ Register HiReg = MI.getOperand(1).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass;
int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
@@ -1154,9 +1180,9 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned LoReg = MI.getOperand(1).getReg();
- unsigned HiReg = MI.getOperand(2).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register LoReg = MI.getOperand(1).getReg();
+ Register HiReg = MI.getOperand(2).getReg();
const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
@@ -1215,12 +1241,12 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
// previous selects in the sequence.
// These conditions could be further relaxed. See the X86 target for a
// related approach and more information.
- unsigned LHS = MI.getOperand(1).getReg();
- unsigned RHS = MI.getOperand(2).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
SmallVector<MachineInstr *, 4> SelectDebugValues;
- SmallSet<unsigned, 4> SelectDests;
+ SmallSet<Register, 4> SelectDests;
SelectDests.insert(MI.getOperand(0).getReg());
MachineInstr *LastSelectPseudo = &MI;
@@ -1363,12 +1389,12 @@ static const MCPhysReg ArgGPRs[] = {
RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17
};
static const MCPhysReg ArgFPR32s[] = {
- RISCV::F10_32, RISCV::F11_32, RISCV::F12_32, RISCV::F13_32,
- RISCV::F14_32, RISCV::F15_32, RISCV::F16_32, RISCV::F17_32
+ RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F,
+ RISCV::F14_F, RISCV::F15_F, RISCV::F16_F, RISCV::F17_F
};
static const MCPhysReg ArgFPR64s[] = {
- RISCV::F10_64, RISCV::F11_64, RISCV::F12_64, RISCV::F13_64,
- RISCV::F14_64, RISCV::F15_64, RISCV::F16_64, RISCV::F17_64
+ RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D,
+ RISCV::F14_D, RISCV::F15_D, RISCV::F16_D, RISCV::F17_D
};
// Pass a 2*XLEN argument that has been split into two XLEN values through
@@ -1378,7 +1404,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
MVT ValVT2, MVT LocVT2,
ISD::ArgFlagsTy ArgFlags2) {
unsigned XLenInBytes = XLen / 8;
- if (unsigned Reg = State.AllocateReg(ArgGPRs)) {
+ if (Register Reg = State.AllocateReg(ArgGPRs)) {
// At least one half can be passed via register.
State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg,
VA1.getLocVT(), CCValAssign::Full));
@@ -1395,7 +1421,7 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
return false;
}
- if (unsigned Reg = State.AllocateReg(ArgGPRs)) {
+ if (Register Reg = State.AllocateReg(ArgGPRs)) {
// The second half can also be passed via register.
State.addLoc(
CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full));
@@ -1495,7 +1521,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
// GPRs, split between a GPR and the stack, or passed completely on the
// stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these
// cases.
- unsigned Reg = State.AllocateReg(ArgGPRs);
+ Register Reg = State.AllocateReg(ArgGPRs);
LocVT = MVT::i32;
if (!Reg) {
unsigned StackOffset = State.AllocateStack(8, 8);
@@ -1537,7 +1563,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
}
// Allocate to a register if possible, or else a stack slot.
- unsigned Reg;
+ Register Reg;
if (ValVT == MVT::f32 && !UseGPRForF32)
Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s);
else if (ValVT == MVT::f64 && !UseGPRForF64)
@@ -1673,7 +1699,7 @@ static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
break;
}
- unsigned VReg = RegInfo.createVirtualRegister(RC);
+ Register VReg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
@@ -1751,7 +1777,7 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
assert(VA.isRegLoc() && "Expected register VA assignment");
- unsigned LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
+ Register LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
RegInfo.addLiveIn(VA.getLocReg(), LoVReg);
SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32);
SDValue Hi;
@@ -1763,13 +1789,70 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
MachinePointerInfo::getFixedStack(MF, FI));
} else {
// Second half of f64 is passed in another GPR.
- unsigned HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
+ Register HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg);
Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32);
}
return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
}
+// FastCC gives less than a 1% performance improvement on some particular
+// benchmarks, but in theory it may still be beneficial in some cases.
+static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+ if (LocVT == MVT::i32 || LocVT == MVT::i64) {
+ // X5 and X6 might be used for save-restore libcall.
+ static const MCPhysReg GPRList[] = {
+ RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14,
+ RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7, RISCV::X28,
+ RISCV::X29, RISCV::X30, RISCV::X31};
+ if (unsigned Reg = State.AllocateReg(GPRList)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ }
+
+ if (LocVT == MVT::f32) {
+ static const MCPhysReg FPR32List[] = {
+ RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F,
+ RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F,
+ RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F,
+ RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F};
+ if (unsigned Reg = State.AllocateReg(FPR32List)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ }
+
+ if (LocVT == MVT::f64) {
+ static const MCPhysReg FPR64List[] = {
+ RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, RISCV::F14_D,
+ RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D,
+ RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, RISCV::F5_D, RISCV::F6_D,
+ RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D};
+ if (unsigned Reg = State.AllocateReg(FPR64List)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return false;
+ }
+ }
+
+ if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+ unsigned Offset4 = State.AllocateStack(4, 4);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo));
+ return false;
+ }
+
+ if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+ unsigned Offset5 = State.AllocateStack(8, 8);
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo));
+ return false;
+ }
+
+ return true; // CC didn't match.
+}
+
// Transform physical registers into virtual registers.
SDValue RISCVTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
@@ -1809,7 +1892,11 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
+
+ if (CallConv == CallingConv::Fast)
+ CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_FastCC);
+ else
+ analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -1877,8 +1964,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
// ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures
// offsets to even-numbered registered remain 2*XLEN-aligned.
if (Idx % 2) {
- FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes,
- true);
+ MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes, true);
VarArgsSaveSize += XLenInBytes;
}
@@ -1886,7 +1972,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
// to the vararg save area.
for (unsigned I = Idx; I < ArgRegs.size();
++I, VaArgOffset += XLenInBytes) {
- const unsigned Reg = RegInfo.createVirtualRegister(RC);
+ const Register Reg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(ArgRegs[I], Reg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
@@ -1920,7 +2006,6 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization(
auto &Callee = CLI.Callee;
auto CalleeCC = CLI.CallConv;
- auto IsVarArg = CLI.IsVarArg;
auto &Outs = CLI.Outs;
auto &Caller = MF.getFunction();
auto CallerCC = Caller.getCallingConv();
@@ -1937,10 +2022,6 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization(
if (Caller.hasFnAttribute("interrupt"))
return false;
- // Do not tail call opt functions with varargs.
- if (IsVarArg)
- return false;
-
// Do not tail call opt if the stack is used to pass parameters.
if (CCInfo.getNextStackOffset() != 0)
return false;
@@ -2015,7 +2096,11 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
- analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
+
+ if (CallConv == CallingConv::Fast)
+ ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_FastCC);
+ else
+ analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
// Check if it's really possible to do a tail call.
if (IsTailCall)
@@ -2057,7 +2142,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
// Copy argument values to their designated locations.
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -2074,7 +2159,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue Lo = SplitF64.getValue(0);
SDValue Hi = SplitF64.getValue(1);
- unsigned RegLo = VA.getLocReg();
+ Register RegLo = VA.getLocReg();
RegsToPass.push_back(std::make_pair(RegLo, Lo));
if (RegLo == RISCV::X17) {
@@ -2087,7 +2172,8 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo()));
} else {
// Second half of f64 is passed in another GPR.
- unsigned RegHigh = RegLo + 1;
+ assert(RegLo < RISCV::X31 && "Invalid register pair");
+ Register RegHigh = RegLo + 1;
RegsToPass.push_back(std::make_pair(RegHigh, Hi));
}
continue;
@@ -2302,8 +2388,9 @@ RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
DAG.getVTList(MVT::i32, MVT::i32), Val);
SDValue Lo = SplitF64.getValue(0);
SDValue Hi = SplitF64.getValue(1);
- unsigned RegLo = VA.getLocReg();
- unsigned RegHi = RegLo + 1;
+ Register RegLo = VA.getLocReg();
+ assert(RegLo < RISCV::X31 && "Invalid register pair");
+ Register RegHi = RegLo + 1;
Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(RegLo, MVT::i32));
@@ -2397,6 +2484,27 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
return nullptr;
}
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+RISCVTargetLowering::ConstraintType
+RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default:
+ break;
+ case 'f':
+ return C_RegisterClass;
+ case 'I':
+ case 'J':
+ case 'K':
+ return C_Immediate;
+ case 'A':
+ return C_Memory;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
std::pair<unsigned, const TargetRegisterClass *>
RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
@@ -2407,14 +2515,125 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
switch (Constraint[0]) {
case 'r':
return std::make_pair(0U, &RISCV::GPRRegClass);
+ case 'f':
+ if (Subtarget.hasStdExtF() && VT == MVT::f32)
+ return std::make_pair(0U, &RISCV::FPR32RegClass);
+ if (Subtarget.hasStdExtD() && VT == MVT::f64)
+ return std::make_pair(0U, &RISCV::FPR64RegClass);
+ break;
default:
break;
}
}
+ // Clang will correctly decode the usage of register name aliases into their
+ // official names. However, other frontends like `rustc` do not. This allows
+ // users of these frontends to use the ABI names for registers in LLVM-style
+ // register constraints.
+ Register XRegFromAlias = StringSwitch<Register>(Constraint.lower())
+ .Case("{zero}", RISCV::X0)
+ .Case("{ra}", RISCV::X1)
+ .Case("{sp}", RISCV::X2)
+ .Case("{gp}", RISCV::X3)
+ .Case("{tp}", RISCV::X4)
+ .Case("{t0}", RISCV::X5)
+ .Case("{t1}", RISCV::X6)
+ .Case("{t2}", RISCV::X7)
+ .Cases("{s0}", "{fp}", RISCV::X8)
+ .Case("{s1}", RISCV::X9)
+ .Case("{a0}", RISCV::X10)
+ .Case("{a1}", RISCV::X11)
+ .Case("{a2}", RISCV::X12)
+ .Case("{a3}", RISCV::X13)
+ .Case("{a4}", RISCV::X14)
+ .Case("{a5}", RISCV::X15)
+ .Case("{a6}", RISCV::X16)
+ .Case("{a7}", RISCV::X17)
+ .Case("{s2}", RISCV::X18)
+ .Case("{s3}", RISCV::X19)
+ .Case("{s4}", RISCV::X20)
+ .Case("{s5}", RISCV::X21)
+ .Case("{s6}", RISCV::X22)
+ .Case("{s7}", RISCV::X23)
+ .Case("{s8}", RISCV::X24)
+ .Case("{s9}", RISCV::X25)
+ .Case("{s10}", RISCV::X26)
+ .Case("{s11}", RISCV::X27)
+ .Case("{t3}", RISCV::X28)
+ .Case("{t4}", RISCV::X29)
+ .Case("{t5}", RISCV::X30)
+ .Case("{t6}", RISCV::X31)
+ .Default(RISCV::NoRegister);
+ if (XRegFromAlias != RISCV::NoRegister)
+ return std::make_pair(XRegFromAlias, &RISCV::GPRRegClass);
+
+  // Since TargetLowering::getRegForInlineAsmConstraint uses the name of the
+  // TableGen record rather than the AsmName to choose registers for InlineAsm
+  // constraints, and since we want to match those names to the widest floating
+  // point register type available, manually select floating point registers here.
+ //
+ // The second case is the ABI name of the register, so that frontends can also
+ // use the ABI names in register constraint lists.
+ if (Subtarget.hasStdExtF() || Subtarget.hasStdExtD()) {
+ std::pair<Register, Register> FReg =
+ StringSwitch<std::pair<Register, Register>>(Constraint.lower())
+ .Cases("{f0}", "{ft0}", {RISCV::F0_F, RISCV::F0_D})
+ .Cases("{f1}", "{ft1}", {RISCV::F1_F, RISCV::F1_D})
+ .Cases("{f2}", "{ft2}", {RISCV::F2_F, RISCV::F2_D})
+ .Cases("{f3}", "{ft3}", {RISCV::F3_F, RISCV::F3_D})
+ .Cases("{f4}", "{ft4}", {RISCV::F4_F, RISCV::F4_D})
+ .Cases("{f5}", "{ft5}", {RISCV::F5_F, RISCV::F5_D})
+ .Cases("{f6}", "{ft6}", {RISCV::F6_F, RISCV::F6_D})
+ .Cases("{f7}", "{ft7}", {RISCV::F7_F, RISCV::F7_D})
+ .Cases("{f8}", "{fs0}", {RISCV::F8_F, RISCV::F8_D})
+ .Cases("{f9}", "{fs1}", {RISCV::F9_F, RISCV::F9_D})
+ .Cases("{f10}", "{fa0}", {RISCV::F10_F, RISCV::F10_D})
+ .Cases("{f11}", "{fa1}", {RISCV::F11_F, RISCV::F11_D})
+ .Cases("{f12}", "{fa2}", {RISCV::F12_F, RISCV::F12_D})
+ .Cases("{f13}", "{fa3}", {RISCV::F13_F, RISCV::F13_D})
+ .Cases("{f14}", "{fa4}", {RISCV::F14_F, RISCV::F14_D})
+ .Cases("{f15}", "{fa5}", {RISCV::F15_F, RISCV::F15_D})
+ .Cases("{f16}", "{fa6}", {RISCV::F16_F, RISCV::F16_D})
+ .Cases("{f17}", "{fa7}", {RISCV::F17_F, RISCV::F17_D})
+ .Cases("{f18}", "{fs2}", {RISCV::F18_F, RISCV::F18_D})
+ .Cases("{f19}", "{fs3}", {RISCV::F19_F, RISCV::F19_D})
+ .Cases("{f20}", "{fs4}", {RISCV::F20_F, RISCV::F20_D})
+ .Cases("{f21}", "{fs5}", {RISCV::F21_F, RISCV::F21_D})
+ .Cases("{f22}", "{fs6}", {RISCV::F22_F, RISCV::F22_D})
+ .Cases("{f23}", "{fs7}", {RISCV::F23_F, RISCV::F23_D})
+ .Cases("{f24}", "{fs8}", {RISCV::F24_F, RISCV::F24_D})
+ .Cases("{f25}", "{fs9}", {RISCV::F25_F, RISCV::F25_D})
+ .Cases("{f26}", "{fs10}", {RISCV::F26_F, RISCV::F26_D})
+ .Cases("{f27}", "{fs11}", {RISCV::F27_F, RISCV::F27_D})
+ .Cases("{f28}", "{ft8}", {RISCV::F28_F, RISCV::F28_D})
+ .Cases("{f29}", "{ft9}", {RISCV::F29_F, RISCV::F29_D})
+ .Cases("{f30}", "{ft10}", {RISCV::F30_F, RISCV::F30_D})
+ .Cases("{f31}", "{ft11}", {RISCV::F31_F, RISCV::F31_D})
+ .Default({RISCV::NoRegister, RISCV::NoRegister});
+ if (FReg.first != RISCV::NoRegister)
+ return Subtarget.hasStdExtD()
+ ? std::make_pair(FReg.second, &RISCV::FPR64RegClass)
+ : std::make_pair(FReg.first, &RISCV::FPR32RegClass);
+ }
+
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
+unsigned
+RISCVTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const {
+  // Currently only single-character constraints are supported.
+ if (ConstraintCode.size() == 1) {
+ switch (ConstraintCode[0]) {
+ case 'A':
+ return InlineAsm::Constraint_A;
+ default:
+ break;
+ }
+ }
+
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+}
+
void RISCVTargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
@@ -2619,3 +2838,13 @@ unsigned RISCVTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return RISCV::X11;
}
+
+bool RISCVTargetLowering::shouldExtendTypeInLibCall(EVT Type) const {
+ // Return false to suppress the unnecessary extensions if the LibCall
+ // arguments or return value is f32 type for LP64 ABI.
+ RISCVABI::ABI ABI = Subtarget.getTargetABI();
+ if (ABI == RISCVABI::ABI_LP64 && (Type == MVT::f32))
+ return false;
+
+ return true;
+}
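
Note: to make the intent of the new i32 ADD/SUB/MUL custom legalisation
concrete, here is a minimal illustration for RV64 (the C function and the
expected assembly are my assumptions, not taken from the patch). Wrapping the
32-bit operation in ANY_EXTEND / SIGN_EXTEND_INREG lets the *W instruction be
selected and its already-sign-extended result be reused without an extra
sext.w:

  // Expected to select a single addw (result already sign-extended) on RV64:
  //   addw a0, a0, a1
  //   ret
  int add32(int a, int b) { return a + b; }
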
diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h
index 17db03bbb69e..18fc7350bbbf 100644
--- a/lib/Target/RISCV/RISCVISelLowering.h
+++ b/lib/Target/RISCV/RISCVISelLowering.h
@@ -92,6 +92,10 @@ public:
// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
+
+ unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override;
+
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
@@ -141,6 +145,8 @@ public:
unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+ bool shouldExtendTypeInLibCall(EVT Type) const override;
+
private:
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins,
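
Note: with getConstraintType returning C_RegisterClass for 'f' and the
FPR32/FPR64 mapping in getRegForInlineAsmConstraint, floating-point inline
assembly operands can now be constrained to FP registers. A hedged usage
sketch, assuming a target with the F extension (the wrapper function is mine;
only the 'f' constraint letter comes from the patch):

  // Force the operands and result of an inline-asm fadd.s into FP registers
  // via the 'f' register-class constraint.
  static inline float fadd_s(float a, float b) {
    float r;
    __asm__("fadd.s %0, %1, %2" : "=f"(r) : "f"(a), "f"(b));
    return r;
  }
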
diff --git a/lib/Target/RISCV/RISCVInstrInfo.cpp b/lib/Target/RISCV/RISCVInstrInfo.cpp
index 99c8d2ef73de..084839299530 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -14,6 +14,7 @@
#include "RISCV.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
+#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -28,8 +29,9 @@
using namespace llvm;
-RISCVInstrInfo::RISCVInstrInfo()
- : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP) {}
+RISCVInstrInfo::RISCVInstrInfo(RISCVSubtarget &STI)
+ : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP),
+ STI(STI) {}
unsigned RISCVInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
@@ -156,24 +158,43 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(Opcode), DstReg).addFrameIndex(FI).addImm(0);
}
-void RISCVInstrInfo::movImm32(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, unsigned DstReg, uint64_t Val,
- MachineInstr::MIFlag Flag) const {
- assert(isInt<32>(Val) && "Can only materialize 32-bit constants");
-
- // TODO: If the value can be materialized using only one instruction, only
- // insert a single instruction.
-
- uint64_t Hi20 = ((Val + 0x800) >> 12) & 0xfffff;
- uint64_t Lo12 = SignExtend64<12>(Val);
- BuildMI(MBB, MBBI, DL, get(RISCV::LUI), DstReg)
- .addImm(Hi20)
- .setMIFlag(Flag);
- BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg)
- .addReg(DstReg, RegState::Kill)
- .addImm(Lo12)
- .setMIFlag(Flag);
+void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register DstReg, uint64_t Val,
+ MachineInstr::MIFlag Flag) const {
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ bool IsRV64 = MF->getSubtarget<RISCVSubtarget>().is64Bit();
+ Register SrcReg = RISCV::X0;
+ Register Result = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ unsigned Num = 0;
+
+ if (!IsRV64 && !isInt<32>(Val))
+ report_fatal_error("Should only materialize 32-bit constants for RV32");
+
+ RISCVMatInt::InstSeq Seq;
+ RISCVMatInt::generateInstSeq(Val, IsRV64, Seq);
+ assert(Seq.size() > 0);
+
+ for (RISCVMatInt::Inst &Inst : Seq) {
+ // Write the final result to DstReg if it's the last instruction in the Seq.
+ // Otherwise, write the result to the temp register.
+ if (++Num == Seq.size())
+ Result = DstReg;
+
+ if (Inst.Opc == RISCV::LUI) {
+ BuildMI(MBB, MBBI, DL, get(RISCV::LUI), Result)
+ .addImm(Inst.Imm)
+ .setMIFlag(Flag);
+ } else {
+ BuildMI(MBB, MBBI, DL, get(Inst.Opc), Result)
+ .addReg(SrcReg, RegState::Kill)
+ .addImm(Inst.Imm)
+ .setMIFlag(Flag);
+ }
+ // Only the first instruction has X0 as its source.
+ SrcReg = Result;
+ }
}
// The contents of values added to Cond are not examined outside of
@@ -372,7 +393,7 @@ unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
// FIXME: A virtual register must be used initially, as the register
// scavenger won't work with empty blocks (SIInstrInfo::insertIndirectBranch
// uses the same workaround).
- unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
auto II = MBB.end();
MachineInstr &LuiMI = *BuildMI(MBB, II, DL, get(RISCV::LUI), ScratchReg)
@@ -466,3 +487,58 @@ bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
}
return MI.isAsCheapAsAMove();
}
+
+bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
+ StringRef &ErrInfo) const {
+ const MCInstrInfo *MCII = STI.getInstrInfo();
+ MCInstrDesc const &Desc = MCII->get(MI.getOpcode());
+
+ for (auto &OI : enumerate(Desc.operands())) {
+ unsigned OpType = OI.value().OperandType;
+ if (OpType >= RISCVOp::OPERAND_FIRST_RISCV_IMM &&
+ OpType <= RISCVOp::OPERAND_LAST_RISCV_IMM) {
+ const MachineOperand &MO = MI.getOperand(OI.index());
+ if (MO.isImm()) {
+ int64_t Imm = MO.getImm();
+ bool Ok;
+ switch (OpType) {
+ default:
+ llvm_unreachable("Unexpected operand type");
+ case RISCVOp::OPERAND_UIMM4:
+ Ok = isUInt<4>(Imm);
+ break;
+ case RISCVOp::OPERAND_UIMM5:
+ Ok = isUInt<5>(Imm);
+ break;
+ case RISCVOp::OPERAND_UIMM12:
+ Ok = isUInt<12>(Imm);
+ break;
+ case RISCVOp::OPERAND_SIMM12:
+ Ok = isInt<12>(Imm);
+ break;
+ case RISCVOp::OPERAND_SIMM13_LSB0:
+ Ok = isShiftedInt<12, 1>(Imm);
+ break;
+ case RISCVOp::OPERAND_UIMM20:
+ Ok = isUInt<20>(Imm);
+ break;
+ case RISCVOp::OPERAND_SIMM21_LSB0:
+ Ok = isShiftedInt<20, 1>(Imm);
+ break;
+ case RISCVOp::OPERAND_UIMMLOG2XLEN:
+ if (STI.getTargetTriple().isArch64Bit())
+ Ok = isUInt<6>(Imm);
+ else
+ Ok = isUInt<5>(Imm);
+ break;
+ }
+ if (!Ok) {
+ ErrInfo = "Invalid immediate";
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
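
Note: a small worked example of the generalised movImm, which reuses the same
RISCVMatInt logic as selectImm in RISCVISelDAGToDAG.cpp (the concrete value is
my choice). Materialising Val = 0x12345 yields a two-entry sequence: LUI with
((0x12345 + 0x800) >> 12) & 0xfffff = 0x12, then ADDI (ADDIW on RV64) with the
low twelve bits 0x345, giving 0x12000 + 0x345 = 0x12345. The LUI writes a
scratch virtual register and only the last instruction of the sequence writes
DstReg. Constants that do not fit in 32 bits on RV64 simply produce longer
LUI/ADDIW/SLLI/ADDI sequences, which the old movImm32 could not handle.
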
diff --git a/lib/Target/RISCV/RISCVInstrInfo.h b/lib/Target/RISCV/RISCVInstrInfo.h
index ff098e660d19..d3ae04aefe04 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/lib/Target/RISCV/RISCVInstrInfo.h
@@ -21,10 +21,12 @@
namespace llvm {
+class RISCVSubtarget;
+
class RISCVInstrInfo : public RISCVGenInstrInfo {
public:
- RISCVInstrInfo();
+ explicit RISCVInstrInfo(RISCVSubtarget &STI);
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
@@ -46,10 +48,10 @@ public:
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- // Materializes the given int32 Val into DstReg.
- void movImm32(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, unsigned DstReg, uint64_t Val,
- MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const;
+ // Materializes the given integer Val into DstReg.
+ void movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register DstReg, uint64_t Val,
+ MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const;
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
@@ -80,6 +82,12 @@ public:
int64_t BrOffset) const override;
bool isAsCheapAsAMove(const MachineInstr &MI) const override;
+
+ bool verifyInstruction(const MachineInstr &MI,
+ StringRef &ErrInfo) const override;
+
+protected:
+ const RISCVSubtarget &STI;
};
}
#endif
diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td
index 69bde15f1218..db2ecc49d14e 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/lib/Target/RISCV/RISCVInstrInfo.td
@@ -69,6 +69,12 @@ class ImmAsmOperand<string prefix, int width, string suffix> : AsmOperandClass {
let DiagnosticType = !strconcat("Invalid", Name);
}
+def ImmZeroAsmOperand : AsmOperandClass {
+ let Name = "ImmZero";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = !strconcat("Invalid", Name);
+}
+
class SImmAsmOperand<int width, string suffix = "">
: ImmAsmOperand<"S", width, suffix> {
}
@@ -87,6 +93,8 @@ def fencearg : Operand<XLenVT> {
let ParserMatchClass = FenceArg;
let PrintMethod = "printFenceArg";
let DecoderMethod = "decodeUImmOperand<4>";
+ let OperandType = "OPERAND_UIMM4";
+ let OperandNamespace = "RISCVOp";
}
def UImmLog2XLenAsmOperand : AsmOperandClass {
@@ -111,11 +119,15 @@ def uimmlog2xlen : Operand<XLenVT>, ImmLeaf<XLenVT, [{
return isUInt<6>(Imm);
return isUInt<5>(Imm);
}];
+ let OperandType = "OPERAND_UIMMLOG2XLEN";
+ let OperandNamespace = "RISCVOp";
}
def uimm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]> {
let ParserMatchClass = UImmAsmOperand<5>;
let DecoderMethod = "decodeUImmOperand<5>";
+ let OperandType = "OPERAND_UIMM5";
+ let OperandNamespace = "RISCVOp";
}
def simm12 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<12>(Imm);}]> {
@@ -128,6 +140,8 @@ def simm12 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<12>(Imm);}]> {
return isInt<12>(Imm);
return MCOp.isBareSymbolRef();
}];
+ let OperandType = "OPERAND_SIMM12";
+ let OperandNamespace = "RISCVOp";
}
// A 13-bit signed immediate where the least significant bit is zero.
@@ -141,6 +155,8 @@ def simm13_lsb0 : Operand<OtherVT> {
return isShiftedInt<12, 1>(Imm);
return MCOp.isBareSymbolRef();
}];
+ let OperandType = "OPERAND_SIMM13_LSB0";
+ let OperandNamespace = "RISCVOp";
}
class UImm20Operand : Operand<XLenVT> {
@@ -152,6 +168,8 @@ class UImm20Operand : Operand<XLenVT> {
return isUInt<20>(Imm);
return MCOp.isBareSymbolRef();
}];
+ let OperandType = "OPERAND_UIMM20";
+ let OperandNamespace = "RISCVOp";
}
def uimm20_lui : UImm20Operand {
@@ -176,6 +194,8 @@ def simm21_lsb0_jal : Operand<OtherVT> {
return isShiftedInt<20, 1>(Imm);
return MCOp.isBareSymbolRef();
}];
+ let OperandType = "OPERAND_SIMM21_LSB0";
+ let OperandNamespace = "RISCVOp";
}
def BareSymbol : AsmOperandClass {
@@ -224,6 +244,8 @@ def csr_sysreg : Operand<XLenVT> {
let ParserMatchClass = CSRSystemRegister;
let PrintMethod = "printCSRSystemRegister";
let DecoderMethod = "decodeUImmOperand<12>";
+ let OperandType = "OPERAND_UIMM12";
+ let OperandNamespace = "RISCVOp";
}
// A parameterized register class alternative to i32imm/i64imm from Target.td.
diff --git a/lib/Target/RISCV/RISCVInstrInfoA.td b/lib/Target/RISCV/RISCVInstrInfoA.td
index b768c9347b38..38ba3f9fb24e 100644
--- a/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -12,14 +12,32 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// Operand and SDNode transformation definitions.
+//===----------------------------------------------------------------------===//
+
+// A parse method for (${gpr}) or 0(${gpr}), where the 0 is silently ignored.
+// Used for GNU as compatibility.
+def AtomicMemOpOperand : AsmOperandClass {
+ let Name = "AtomicMemOpOperand";
+ let RenderMethod = "addRegOperands";
+ let PredicateMethod = "isReg";
+ let ParserMethod = "parseAtomicMemOp";
+}
+
+def GPRMemAtomic : RegisterOperand<GPR> {
+ let ParserMatchClass = AtomicMemOpOperand;
+ let PrintMethod = "printAtomicMemOp";
+}
+
+//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
class LR_r<bit aq, bit rl, bits<3> funct3, string opcodestr>
: RVInstRAtomic<0b00010, aq, rl, funct3, OPC_AMO,
- (outs GPR:$rd), (ins GPR:$rs1),
- opcodestr, "$rd, (${rs1})"> {
+ (outs GPR:$rd), (ins GPRMemAtomic:$rs1),
+ opcodestr, "$rd, $rs1"> {
let rs2 = 0;
}
@@ -33,8 +51,8 @@ multiclass LR_r_aq_rl<bits<3> funct3, string opcodestr> {
let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in
class AMO_rr<bits<5> funct5, bit aq, bit rl, bits<3> funct3, string opcodestr>
: RVInstRAtomic<funct5, aq, rl, funct3, OPC_AMO,
- (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
- opcodestr, "$rd, $rs2, (${rs1})">;
+ (outs GPR:$rd), (ins GPRMemAtomic:$rs1, GPR:$rs2),
+ opcodestr, "$rd, $rs2, $rs1">;
multiclass AMO_rr_aq_rl<bits<5> funct5, bits<3> funct3, string opcodestr> {
def "" : AMO_rr<funct5, 0, 0, funct3, opcodestr>;
@@ -196,12 +214,12 @@ class PseudoMaskedAMOUMinUMax
}
class PseudoMaskedAMOPat<Intrinsic intrin, Pseudo AMOInst>
- : Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, imm:$ordering),
+ : Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, timm:$ordering),
(AMOInst GPR:$addr, GPR:$incr, GPR:$mask, imm:$ordering)>;
class PseudoMaskedAMOMinMaxPat<Intrinsic intrin, Pseudo AMOInst>
: Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt,
- imm:$ordering),
+ timm:$ordering),
(AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt,
imm:$ordering)>;
@@ -270,7 +288,7 @@ def PseudoMaskedCmpXchg32
}
def : Pat<(int_riscv_masked_cmpxchg_i32
- GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering),
+ GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering),
(PseudoMaskedCmpXchg32
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>;
@@ -347,7 +365,7 @@ def PseudoCmpXchg64 : PseudoCmpXchg;
defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64>;
def : Pat<(int_riscv_masked_cmpxchg_i64
- GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering),
+ GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering),
(PseudoMaskedCmpXchg32
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>;
} // Predicates = [HasStdExtA, IsRV64]
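A note on the GPRMemAtomic change at the top of this file: the assembler now accepts both the canonical spelling of the A-extension memory operand, e.g. "lr.w a0, (a1)" or "amoadd.w a2, a3, (a1)", and the GNU-as-compatible form with an explicit zero offset, e.g. "lr.w a0, 0(a1)", where the 0 is parsed and dropped; printing still uses the parenthesised form via printAtomicMemOp.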
diff --git a/lib/Target/RISCV/RISCVInstrInfoC.td b/lib/Target/RISCV/RISCVInstrInfoC.td
index 94477341eea7..fa0050f107b2 100644
--- a/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -61,6 +61,11 @@ def simm6nonzero : Operand<XLenVT>,
}];
}
+def immzero : Operand<XLenVT>,
+ ImmLeaf<XLenVT, [{return (Imm == 0);}]> {
+ let ParserMatchClass = ImmZeroAsmOperand;
+}
+
def CLUIImmAsmOperand : AsmOperandClass {
let Name = "CLUIImm";
let RenderMethod = "addImmOperands";
@@ -132,7 +137,8 @@ def uimm8_lsb000 : Operand<XLenVT>,
}
// A 9-bit signed immediate where the least significant bit is zero.
-def simm9_lsb0 : Operand<OtherVT> {
+def simm9_lsb0 : Operand<OtherVT>,
+ ImmLeaf<XLenVT, [{return isShiftedInt<8, 1>(Imm);}]> {
let ParserMatchClass = SImmAsmOperand<9, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<9>";
@@ -191,7 +197,8 @@ def simm10_lsb0000nonzero : Operand<XLenVT>,
}
// A 12-bit signed immediate where the least significant bit is zero.
-def simm12_lsb0 : Operand<XLenVT> {
+def simm12_lsb0 : Operand<XLenVT>,
+ ImmLeaf<XLenVT, [{return isShiftedInt<11, 1>(Imm);}]> {
let ParserMatchClass = SImmAsmOperand<12, "Lsb0">;
let EncoderMethod = "getImmOpValueAsr1";
let DecoderMethod = "decodeSImmOperandAndLsl1<12>";
@@ -344,7 +351,10 @@ def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000> {
}
let rd = 0, imm = 0, hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">;
+def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">
+{
+ let Inst{6-2} = 0;
+}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
@@ -354,6 +364,15 @@ def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
let Inst{6-2} = imm{4-0};
}
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+def C_ADDI_NOP : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb),
+ (ins GPRX0:$rd, immzero:$imm),
+ "c.addi", "$rd, $imm"> {
+ let Constraints = "$rd = $rd_wb";
+ let Inst{6-2} = 0;
+ let isAsmParserOnly = 1;
+}
+
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1,
DecoderNamespace = "RISCV32Only_", Defs = [X1],
Predicates = [HasStdExtC, IsRV32] in
@@ -523,6 +542,105 @@ def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther> {
} // Predicates = [HasStdExtC]
//===----------------------------------------------------------------------===//
+// HINT Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtC, HasRVCHints], hasSideEffects = 0, mayLoad = 0,
+ mayStore = 0 in
+{
+
+let rd = 0 in
+def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm),
+ "c.nop", "$imm"> {
+ let Inst{6-2} = imm{4-0};
+ let DecoderMethod = "decodeRVCInstrSImm";
+}
+
+// Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6.
+def C_ADDI_HINT_X0 : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb),
+ (ins GPRX0:$rd, simm6nonzero:$imm),
+ "c.addi", "$rd, $imm"> {
+ let Constraints = "$rd = $rd_wb";
+ let Inst{6-2} = imm{4-0};
+ let isAsmParserOnly = 1;
+}
+
+def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
+ (ins GPRNoX0:$rd, immzero:$imm),
+ "c.addi", "$rd, $imm"> {
+ let Constraints = "$rd = $rd_wb";
+ let Inst{6-2} = 0;
+ let isAsmParserOnly = 1;
+}
+
+def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm),
+ "c.li", "$rd, $imm"> {
+ let Inst{6-2} = imm{4-0};
+ let Inst{11-7} = 0;
+ let DecoderMethod = "decodeRVCInstrRdSImm";
+}
+
+def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd),
+ (ins c_lui_imm:$imm),
+ "c.lui", "$rd, $imm"> {
+ let Inst{6-2} = imm{4-0};
+ let Inst{11-7} = 0;
+ let DecoderMethod = "decodeRVCInstrRdSImm";
+}
+
+def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2),
+ "c.mv", "$rs1, $rs2">
+{
+ let Inst{11-7} = 0;
+ let DecoderMethod = "decodeRVCInstrRdRs2";
+}
+
+def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rs1_wb),
+ (ins GPRX0:$rs1, GPRNoX0:$rs2),
+ "c.add", "$rs1, $rs2"> {
+ let Constraints = "$rs1 = $rs1_wb";
+ let Inst{11-7} = 0;
+ let DecoderMethod = "decodeRVCInstrRdRs1Rs2";
+}
+
+def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb),
+ (ins GPRX0:$rd, uimmlog2xlennonzero:$imm),
+ "c.slli" ,"$rd, $imm"> {
+ let Constraints = "$rd = $rd_wb";
+ let Inst{6-2} = imm{4-0};
+ let Inst{11-7} = 0;
+ let DecoderMethod = "decodeRVCInstrRdRs1UImm";
+}
+
+def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd),
+ "c.slli64" ,"$rd"> {
+ let Constraints = "$rd = $rd_wb";
+ let Inst{6-2} = 0;
+ let Inst{12} = 0;
+}
+
+def C_SRLI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb),
+ (ins GPRC:$rd),
+ "c.srli64", "$rd"> {
+ let Constraints = "$rd = $rd_wb";
+ let Inst{6-2} = 0;
+ let Inst{11-10} = 0;
+ let Inst{12} = 0;
+}
+
+def C_SRAI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb),
+ (ins GPRC:$rd),
+ "c.srai64", "$rd"> {
+ let Constraints = "$rd = $rd_wb";
+ let Inst{6-2} = 0;
+ let Inst{11-10} = 1;
+ let Inst{12} = 0;
+}
+
+} // Predicates = [HasStdExtC, HasRVCHints], hasSideEffects = 0, mayLoad = 0,
+ // mayStore = 0
+
+//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
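In effect, the HINT block above makes the C-extension hint encodings first-class when the RVC hint feature (predicate HasRVCHints, subtarget flag EnableRVCHintInstrs) is enabled: spellings such as "c.nop 5" (equivalently "c.addi x0, 5"), "c.li x0, 7", "c.lui x0, 1", "c.mv x0, a1", "c.add x0, a1" and "c.slli x0, 3" now assemble and disassemble as hints instead of being rejected, as do the shift-by-zero forms c.slli64/c.srli64/c.srai64. Separately, C_ADDI_NOP (added earlier in this file) lets "c.addi x0, 0" be accepted as another spelling of the canonical c.nop encoding, and C_ADDI_HINT_IMM_ZERO does the same for "c.addi rd, 0" with rd != x0 as a hint.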
diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td
index 032642942f2b..3b73c865ea17 100644
--- a/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -227,6 +227,12 @@ def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 2>;
def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs)>;
def : InstAlias<"fscsr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 2>;
+// frsr, fssr are obsolete aliases replaced by frcsr, fscsr, so give them
+// zero weight.
+def : InstAlias<"frsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 0>;
+def : InstAlias<"fssr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs), 0>;
+def : InstAlias<"fssr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 0>;
+
def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, FRM.Encoding, X0), 2>;
def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, FRM.Encoding, GPR:$rs)>;
def : InstAlias<"fsrm $rs", (CSRRW X0, FRM.Encoding, GPR:$rs), 2>;
diff --git a/lib/Target/RISCV/RISCVInstructionSelector.cpp b/lib/Target/RISCV/RISCVInstructionSelector.cpp
new file mode 100644
index 000000000000..5bd09a546114
--- /dev/null
+++ b/lib/Target/RISCV/RISCVInstructionSelector.cpp
@@ -0,0 +1,103 @@
+//===-- RISCVInstructionSelector.cpp -----------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// RISCV.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "RISCVRegisterBankInfo.h"
+#include "RISCVSubtarget.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "riscv-isel"
+
+using namespace llvm;
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "RISCVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+namespace {
+
+class RISCVInstructionSelector : public InstructionSelector {
+public:
+ RISCVInstructionSelector(const RISCVTargetMachine &TM,
+ const RISCVSubtarget &STI,
+ const RISCVRegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) override;
+ static const char *getName() { return DEBUG_TYPE; }
+
+private:
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
+
+ const RISCVSubtarget &STI;
+ const RISCVInstrInfo &TII;
+ const RISCVRegisterInfo &TRI;
+ const RISCVRegisterBankInfo &RBI;
+
+ // FIXME: This is necessary because DAGISel uses "Subtarget->" and GlobalISel
+ // uses "STI." in the code generated by TableGen. We need to unify the name of
+ // the Subtarget variable.
+ const RISCVSubtarget *Subtarget = &STI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
+#include "RISCVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "RISCVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "RISCVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+RISCVInstructionSelector::RISCVInstructionSelector(
+ const RISCVTargetMachine &TM, const RISCVSubtarget &STI,
+ const RISCVRegisterBankInfo &RBI)
+ : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI),
+
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "RISCVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "RISCVGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+bool RISCVInstructionSelector::select(MachineInstr &I) {
+
+ if (!isPreISelGenericOpcode(I.getOpcode())) {
+ // Certain non-generic instructions also need some special handling.
+ return true;
+ }
+
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+
+ return false;
+}
+
+namespace llvm {
+InstructionSelector *
+createRISCVInstructionSelector(const RISCVTargetMachine &TM,
+ RISCVSubtarget &Subtarget,
+ RISCVRegisterBankInfo &RBI) {
+ return new RISCVInstructionSelector(TM, Subtarget, RBI);
+}
+} // end namespace llvm
diff --git a/lib/Target/RISCV/RISCVLegalizerInfo.cpp b/lib/Target/RISCV/RISCVLegalizerInfo.cpp
new file mode 100644
index 000000000000..c92f4a3ee17b
--- /dev/null
+++ b/lib/Target/RISCV/RISCVLegalizerInfo.cpp
@@ -0,0 +1,23 @@
+//===-- RISCVLegalizerInfo.cpp ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for RISCV.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "RISCVLegalizerInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+
+using namespace llvm;
+
+RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) {
+ computeTables();
+}
diff --git a/lib/Target/RISCV/RISCVLegalizerInfo.h b/lib/Target/RISCV/RISCVLegalizerInfo.h
new file mode 100644
index 000000000000..f2c2b9a3fd46
--- /dev/null
+++ b/lib/Target/RISCV/RISCVLegalizerInfo.h
@@ -0,0 +1,28 @@
+//===-- RISCVLegalizerInfo.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for RISCV.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class RISCVSubtarget;
+
+/// This class provides the legalization rules for the RISCV target.
+class RISCVLegalizerInfo : public LegalizerInfo {
+public:
+ RISCVLegalizerInfo(const RISCVSubtarget &ST);
+};
+} // end namespace llvm
+#endif
diff --git a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index 82b1209cb8e7..4c9013aa1e23 100644
--- a/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -45,7 +45,7 @@ struct RISCVMergeBaseOffsetOpt : public MachineFunctionPass {
bool detectAndFoldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI);
void foldOffset(MachineInstr &HiLUI, MachineInstr &LoADDI, MachineInstr &Tail,
int64_t Offset);
- bool matchLargeOffset(MachineInstr &TailAdd, unsigned GSReg, int64_t &Offset);
+ bool matchLargeOffset(MachineInstr &TailAdd, Register GSReg, int64_t &Offset);
RISCVMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}
MachineFunctionProperties getRequiredProperties() const override {
@@ -85,7 +85,7 @@ bool RISCVMergeBaseOffsetOpt::detectLuiAddiGlobal(MachineInstr &HiLUI,
HiLUI.getOperand(1).getOffset() != 0 ||
!MRI->hasOneUse(HiLUI.getOperand(0).getReg()))
return false;
- unsigned HiLuiDestReg = HiLUI.getOperand(0).getReg();
+ Register HiLuiDestReg = HiLUI.getOperand(0).getReg();
LoADDI = MRI->use_begin(HiLuiDestReg)->getParent();
if (LoADDI->getOpcode() != RISCV::ADDI ||
LoADDI->getOperand(2).getTargetFlags() != RISCVII::MO_LO ||
@@ -132,12 +132,12 @@ void RISCVMergeBaseOffsetOpt::foldOffset(MachineInstr &HiLUI,
// \ /
// TailAdd: add vreg4, vreg2, voff
bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd,
- unsigned GAReg,
+ Register GAReg,
int64_t &Offset) {
assert((TailAdd.getOpcode() == RISCV::ADD) && "Expected ADD instruction!");
- unsigned Rs = TailAdd.getOperand(1).getReg();
- unsigned Rt = TailAdd.getOperand(2).getReg();
- unsigned Reg = Rs == GAReg ? Rt : Rs;
+ Register Rs = TailAdd.getOperand(1).getReg();
+ Register Rt = TailAdd.getOperand(2).getReg();
+ Register Reg = Rs == GAReg ? Rt : Rs;
// Can't fold if the register has more than one use.
if (!MRI->hasOneUse(Reg))
@@ -178,7 +178,7 @@ bool RISCVMergeBaseOffsetOpt::matchLargeOffset(MachineInstr &TailAdd,
bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI,
MachineInstr &LoADDI) {
- unsigned DestReg = LoADDI.getOperand(0).getReg();
+ Register DestReg = LoADDI.getOperand(0).getReg();
assert(MRI->hasOneUse(DestReg) && "expected one use for LoADDI");
// LoADDI has only one use.
MachineInstr &Tail = *MRI->use_begin(DestReg)->getParent();
@@ -232,7 +232,7 @@ bool RISCVMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &HiLUI,
return false;
// Register defined by LoADDI should be used in the base part of the
// load\store instruction. Otherwise, no folding possible.
- unsigned BaseAddrReg = Tail.getOperand(1).getReg();
+ Register BaseAddrReg = Tail.getOperand(1).getReg();
if (DestReg != BaseAddrReg)
return false;
MachineOperand &TailImmOp = Tail.getOperand(2);
diff --git a/lib/Target/RISCV/RISCVRegisterBankInfo.cpp b/lib/Target/RISCV/RISCVRegisterBankInfo.cpp
new file mode 100644
index 000000000000..bd3b95a98b9f
--- /dev/null
+++ b/lib/Target/RISCV/RISCVRegisterBankInfo.cpp
@@ -0,0 +1,26 @@
+//===-- RISCVRegisterBankInfo.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for RISCV.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "RISCVRegisterBankInfo.h"
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "RISCVGenRegisterBank.inc"
+
+using namespace llvm;
+
+RISCVRegisterBankInfo::RISCVRegisterBankInfo(const TargetRegisterInfo &TRI)
+ : RISCVGenRegisterBankInfo() {}
diff --git a/lib/Target/RISCV/RISCVRegisterBankInfo.h b/lib/Target/RISCV/RISCVRegisterBankInfo.h
new file mode 100644
index 000000000000..05fac992734d
--- /dev/null
+++ b/lib/Target/RISCV/RISCVRegisterBankInfo.h
@@ -0,0 +1,37 @@
+//===-- RISCVRegisterBankInfo.h ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for RISCV.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_RISCV_RISCVREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "RISCVGenRegisterBank.inc"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+class RISCVGenRegisterBankInfo : public RegisterBankInfo {
+protected:
+#define GET_TARGET_REGBANK_CLASS
+#include "RISCVGenRegisterBank.inc"
+};
+
+/// This class provides the information for the target register banks.
+class RISCVRegisterBankInfo final : public RISCVGenRegisterBankInfo {
+public:
+ RISCVRegisterBankInfo(const TargetRegisterInfo &TRI);
+};
+} // end namespace llvm
+#endif
diff --git a/lib/Target/RISCV/RISCVRegisterBanks.td b/lib/Target/RISCV/RISCVRegisterBanks.td
new file mode 100644
index 000000000000..400b65a1bf9a
--- /dev/null
+++ b/lib/Target/RISCV/RISCVRegisterBanks.td
@@ -0,0 +1,13 @@
+//=-- RISCVRegisterBanks.td - Describe the RISCV Banks -------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the register banks for the RISCV target.
+//
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers: X.
+def GPRRegBank : RegisterBank<"GPRB", [GPR]>;
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp
index e6a126e3e513..66557687c0b6 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -26,6 +26,15 @@
using namespace llvm;
+static_assert(RISCV::X1 == RISCV::X0 + 1, "Register list not consecutive");
+static_assert(RISCV::X31 == RISCV::X0 + 31, "Register list not consecutive");
+static_assert(RISCV::F1_F == RISCV::F0_F + 1, "Register list not consecutive");
+static_assert(RISCV::F31_F == RISCV::F0_F + 31,
+ "Register list not consecutive");
+static_assert(RISCV::F1_D == RISCV::F0_D + 1, "Register list not consecutive");
+static_assert(RISCV::F31_D == RISCV::F0_D + 31,
+ "Register list not consecutive");
+
RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode)
: RISCVGenRegisterInfo(RISCV::X1, /*DwarfFlavour*/0, /*EHFlavor*/0,
/*PC*/0, HwMode) {}
@@ -109,8 +118,8 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
assert(isInt<32>(Offset) && "Int32 expected");
// The offset won't fit in an immediate, so use a scratch register instead
// Modify Offset and FrameReg appropriately
- unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
- TII->movImm32(MBB, II, DL, ScratchReg, Offset);
+ Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ TII->movImm(MBB, II, DL, ScratchReg, Offset);
BuildMI(MBB, II, DL, TII->get(RISCV::ADD), ScratchReg)
.addReg(FrameReg)
.addReg(ScratchReg, RegState::Kill);
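The static_asserts added at the top of this file pin down a layout assumption that RISCV code is then free to rely on: registers within each class can be reached by arithmetic on the generated enum values. A hedged sketch of the kind of helper this enables (hypothetical, not part of the patch):

// Hypothetical helper: map an index 0..31 to the corresponding GPR. This is only
// valid because X0..X31 are declared consecutively, which the static_asserts check.
static MCPhysReg gprFromIndex(unsigned Idx) {
  assert(Idx < 32 && "GPR index out of range");
  return static_cast<MCPhysReg>(RISCV::X0 + Idx);
}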
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.h b/lib/Target/RISCV/RISCVRegisterInfo.h
index 4f339475508f..56a50fe6ddc0 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -52,6 +52,12 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
bool trackLivenessAfterRegAlloc(const MachineFunction &) const override {
return true;
}
+
+ const TargetRegisterClass *
+ getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind = 0) const override {
+ return &RISCV::GPRRegClass;
+ }
};
}
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.td b/lib/Target/RISCV/RISCVRegisterInfo.td
index 79f8ab12f6c0..82b37afd0805 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -101,6 +101,12 @@ def GPR : RegisterClass<"RISCV", [XLenVT], 32, (add
[RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
}
+def GPRX0 : RegisterClass<"RISCV", [XLenVT], 32, (add X0)> {
+ let RegInfos = RegInfoByHwMode<
+ [RV32, RV64, DefaultMode],
+ [RegInfo<32,32,32>, RegInfo<64,64,64>, RegInfo<32,32,32>]>;
+}
+
// The order of registers represents the preferred allocation sequence.
// Registers are listed in the order caller-save, callee-save, specials.
def GPRNoX0 : RegisterClass<"RISCV", [XLenVT], 32, (add
@@ -159,41 +165,41 @@ def SP : RegisterClass<"RISCV", [XLenVT], 32, (add X2)> {
// Floating point registers
let RegAltNameIndices = [ABIRegAltName] in {
- def F0_32 : RISCVReg32<0, "f0", ["ft0"]>, DwarfRegNum<[32]>;
- def F1_32 : RISCVReg32<1, "f1", ["ft1"]>, DwarfRegNum<[33]>;
- def F2_32 : RISCVReg32<2, "f2", ["ft2"]>, DwarfRegNum<[34]>;
- def F3_32 : RISCVReg32<3, "f3", ["ft3"]>, DwarfRegNum<[35]>;
- def F4_32 : RISCVReg32<4, "f4", ["ft4"]>, DwarfRegNum<[36]>;
- def F5_32 : RISCVReg32<5, "f5", ["ft5"]>, DwarfRegNum<[37]>;
- def F6_32 : RISCVReg32<6, "f6", ["ft6"]>, DwarfRegNum<[38]>;
- def F7_32 : RISCVReg32<7, "f7", ["ft7"]>, DwarfRegNum<[39]>;
- def F8_32 : RISCVReg32<8, "f8", ["fs0"]>, DwarfRegNum<[40]>;
- def F9_32 : RISCVReg32<9, "f9", ["fs1"]>, DwarfRegNum<[41]>;
- def F10_32 : RISCVReg32<10,"f10", ["fa0"]>, DwarfRegNum<[42]>;
- def F11_32 : RISCVReg32<11,"f11", ["fa1"]>, DwarfRegNum<[43]>;
- def F12_32 : RISCVReg32<12,"f12", ["fa2"]>, DwarfRegNum<[44]>;
- def F13_32 : RISCVReg32<13,"f13", ["fa3"]>, DwarfRegNum<[45]>;
- def F14_32 : RISCVReg32<14,"f14", ["fa4"]>, DwarfRegNum<[46]>;
- def F15_32 : RISCVReg32<15,"f15", ["fa5"]>, DwarfRegNum<[47]>;
- def F16_32 : RISCVReg32<16,"f16", ["fa6"]>, DwarfRegNum<[48]>;
- def F17_32 : RISCVReg32<17,"f17", ["fa7"]>, DwarfRegNum<[49]>;
- def F18_32 : RISCVReg32<18,"f18", ["fs2"]>, DwarfRegNum<[50]>;
- def F19_32 : RISCVReg32<19,"f19", ["fs3"]>, DwarfRegNum<[51]>;
- def F20_32 : RISCVReg32<20,"f20", ["fs4"]>, DwarfRegNum<[52]>;
- def F21_32 : RISCVReg32<21,"f21", ["fs5"]>, DwarfRegNum<[53]>;
- def F22_32 : RISCVReg32<22,"f22", ["fs6"]>, DwarfRegNum<[54]>;
- def F23_32 : RISCVReg32<23,"f23", ["fs7"]>, DwarfRegNum<[55]>;
- def F24_32 : RISCVReg32<24,"f24", ["fs8"]>, DwarfRegNum<[56]>;
- def F25_32 : RISCVReg32<25,"f25", ["fs9"]>, DwarfRegNum<[57]>;
- def F26_32 : RISCVReg32<26,"f26", ["fs10"]>, DwarfRegNum<[58]>;
- def F27_32 : RISCVReg32<27,"f27", ["fs11"]>, DwarfRegNum<[59]>;
- def F28_32 : RISCVReg32<28,"f28", ["ft8"]>, DwarfRegNum<[60]>;
- def F29_32 : RISCVReg32<29,"f29", ["ft9"]>, DwarfRegNum<[61]>;
- def F30_32 : RISCVReg32<30,"f30", ["ft10"]>, DwarfRegNum<[62]>;
- def F31_32 : RISCVReg32<31,"f31", ["ft11"]>, DwarfRegNum<[63]>;
+ def F0_F : RISCVReg32<0, "f0", ["ft0"]>, DwarfRegNum<[32]>;
+ def F1_F : RISCVReg32<1, "f1", ["ft1"]>, DwarfRegNum<[33]>;
+ def F2_F : RISCVReg32<2, "f2", ["ft2"]>, DwarfRegNum<[34]>;
+ def F3_F : RISCVReg32<3, "f3", ["ft3"]>, DwarfRegNum<[35]>;
+ def F4_F : RISCVReg32<4, "f4", ["ft4"]>, DwarfRegNum<[36]>;
+ def F5_F : RISCVReg32<5, "f5", ["ft5"]>, DwarfRegNum<[37]>;
+ def F6_F : RISCVReg32<6, "f6", ["ft6"]>, DwarfRegNum<[38]>;
+ def F7_F : RISCVReg32<7, "f7", ["ft7"]>, DwarfRegNum<[39]>;
+ def F8_F : RISCVReg32<8, "f8", ["fs0"]>, DwarfRegNum<[40]>;
+ def F9_F : RISCVReg32<9, "f9", ["fs1"]>, DwarfRegNum<[41]>;
+ def F10_F : RISCVReg32<10,"f10", ["fa0"]>, DwarfRegNum<[42]>;
+ def F11_F : RISCVReg32<11,"f11", ["fa1"]>, DwarfRegNum<[43]>;
+ def F12_F : RISCVReg32<12,"f12", ["fa2"]>, DwarfRegNum<[44]>;
+ def F13_F : RISCVReg32<13,"f13", ["fa3"]>, DwarfRegNum<[45]>;
+ def F14_F : RISCVReg32<14,"f14", ["fa4"]>, DwarfRegNum<[46]>;
+ def F15_F : RISCVReg32<15,"f15", ["fa5"]>, DwarfRegNum<[47]>;
+ def F16_F : RISCVReg32<16,"f16", ["fa6"]>, DwarfRegNum<[48]>;
+ def F17_F : RISCVReg32<17,"f17", ["fa7"]>, DwarfRegNum<[49]>;
+ def F18_F : RISCVReg32<18,"f18", ["fs2"]>, DwarfRegNum<[50]>;
+ def F19_F : RISCVReg32<19,"f19", ["fs3"]>, DwarfRegNum<[51]>;
+ def F20_F : RISCVReg32<20,"f20", ["fs4"]>, DwarfRegNum<[52]>;
+ def F21_F : RISCVReg32<21,"f21", ["fs5"]>, DwarfRegNum<[53]>;
+ def F22_F : RISCVReg32<22,"f22", ["fs6"]>, DwarfRegNum<[54]>;
+ def F23_F : RISCVReg32<23,"f23", ["fs7"]>, DwarfRegNum<[55]>;
+ def F24_F : RISCVReg32<24,"f24", ["fs8"]>, DwarfRegNum<[56]>;
+ def F25_F : RISCVReg32<25,"f25", ["fs9"]>, DwarfRegNum<[57]>;
+ def F26_F : RISCVReg32<26,"f26", ["fs10"]>, DwarfRegNum<[58]>;
+ def F27_F : RISCVReg32<27,"f27", ["fs11"]>, DwarfRegNum<[59]>;
+ def F28_F : RISCVReg32<28,"f28", ["ft8"]>, DwarfRegNum<[60]>;
+ def F29_F : RISCVReg32<29,"f29", ["ft9"]>, DwarfRegNum<[61]>;
+ def F30_F : RISCVReg32<30,"f30", ["ft10"]>, DwarfRegNum<[62]>;
+ def F31_F : RISCVReg32<31,"f31", ["ft11"]>, DwarfRegNum<[63]>;
foreach Index = 0-31 in {
- def F#Index#_64 : RISCVReg64<!cast<RISCVReg32>("F"#Index#"_32")>,
+ def F#Index#_D : RISCVReg64<!cast<RISCVReg32>("F"#Index#"_F")>,
DwarfRegNum<[!add(Index, 32)]>;
}
}
@@ -201,29 +207,29 @@ let RegAltNameIndices = [ABIRegAltName] in {
// The order of registers represents the preferred allocation sequence,
// meaning caller-save regs are listed before callee-save.
def FPR32 : RegisterClass<"RISCV", [f32], 32, (add
- (sequence "F%u_32", 0, 7),
- (sequence "F%u_32", 10, 17),
- (sequence "F%u_32", 28, 31),
- (sequence "F%u_32", 8, 9),
- (sequence "F%u_32", 18, 27)
+ (sequence "F%u_F", 0, 7),
+ (sequence "F%u_F", 10, 17),
+ (sequence "F%u_F", 28, 31),
+ (sequence "F%u_F", 8, 9),
+ (sequence "F%u_F", 18, 27)
)>;
def FPR32C : RegisterClass<"RISCV", [f32], 32, (add
- (sequence "F%u_32", 10, 15),
- (sequence "F%u_32", 8, 9)
+ (sequence "F%u_F", 10, 15),
+ (sequence "F%u_F", 8, 9)
)>;
// The order of registers represents the preferred allocation sequence,
// meaning caller-save regs are listed before callee-save.
def FPR64 : RegisterClass<"RISCV", [f64], 64, (add
- (sequence "F%u_64", 0, 7),
- (sequence "F%u_64", 10, 17),
- (sequence "F%u_64", 28, 31),
- (sequence "F%u_64", 8, 9),
- (sequence "F%u_64", 18, 27)
+ (sequence "F%u_D", 0, 7),
+ (sequence "F%u_D", 10, 17),
+ (sequence "F%u_D", 28, 31),
+ (sequence "F%u_D", 8, 9),
+ (sequence "F%u_D", 18, 27)
)>;
def FPR64C : RegisterClass<"RISCV", [f64], 64, (add
- (sequence "F%u_64", 10, 15),
- (sequence "F%u_64", 8, 9)
+ (sequence "F%u_D", 10, 15),
+ (sequence "F%u_D", 8, 9)
)>;
diff --git a/lib/Target/RISCV/RISCVSubtarget.cpp b/lib/Target/RISCV/RISCVSubtarget.cpp
index 6902ed75d852..f114c6ac1925 100644
--- a/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -12,7 +12,11 @@
#include "RISCVSubtarget.h"
#include "RISCV.h"
+#include "RISCVCallLowering.h"
#include "RISCVFrameLowering.h"
+#include "RISCVLegalizerInfo.h"
+#include "RISCVRegisterBankInfo.h"
+#include "RISCVTargetMachine.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -47,4 +51,28 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
StringRef ABIName, const TargetMachine &TM)
: RISCVGenSubtargetInfo(TT, CPU, FS),
FrameLowering(initializeSubtargetDependencies(TT, CPU, FS, ABIName)),
- InstrInfo(), RegInfo(getHwMode()), TLInfo(TM, *this) {}
+ InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {
+ CallLoweringInfo.reset(new RISCVCallLowering(*getTargetLowering()));
+ Legalizer.reset(new RISCVLegalizerInfo(*this));
+
+ auto *RBI = new RISCVRegisterBankInfo(*getRegisterInfo());
+ RegBankInfo.reset(RBI);
+ InstSelector.reset(createRISCVInstructionSelector(
+ *static_cast<const RISCVTargetMachine *>(&TM), *this, *RBI));
+}
+
+const CallLowering *RISCVSubtarget::getCallLowering() const {
+ return CallLoweringInfo.get();
+}
+
+InstructionSelector *RISCVSubtarget::getInstructionSelector() const {
+ return InstSelector.get();
+}
+
+const LegalizerInfo *RISCVSubtarget::getLegalizerInfo() const {
+ return Legalizer.get();
+}
+
+const RegisterBankInfo *RISCVSubtarget::getRegBankInfo() const {
+ return RegBankInfo.get();
+}
diff --git a/lib/Target/RISCV/RISCVSubtarget.h b/lib/Target/RISCV/RISCVSubtarget.h
index 106ff49f021a..7d0373a5253a 100644
--- a/lib/Target/RISCV/RISCVSubtarget.h
+++ b/lib/Target/RISCV/RISCVSubtarget.h
@@ -17,6 +17,10 @@
#include "RISCVISelLowering.h"
#include "RISCVInstrInfo.h"
#include "Utils/RISCVBaseInfo.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -38,6 +42,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasRV64 = false;
bool IsRV32E = false;
bool EnableLinkerRelax = false;
+ bool EnableRVCHintInstrs = false;
unsigned XLen = 32;
MVT XLenVT = MVT::i32;
RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
@@ -75,6 +80,7 @@ public:
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
+ bool enableMachineScheduler() const override { return true; }
bool hasStdExtM() const { return HasStdExtM; }
bool hasStdExtA() const { return HasStdExtA; }
bool hasStdExtF() const { return HasStdExtF; }
@@ -83,9 +89,23 @@ public:
bool is64Bit() const { return HasRV64; }
bool isRV32E() const { return IsRV32E; }
bool enableLinkerRelax() const { return EnableLinkerRelax; }
+ bool enableRVCHintInstrs() const { return EnableRVCHintInstrs; }
MVT getXLenVT() const { return XLenVT; }
unsigned getXLen() const { return XLen; }
RISCVABI::ABI getTargetABI() const { return TargetABI; }
+
+protected:
+ // GlobalISel related APIs.
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
+public:
+ const CallLowering *getCallLowering() const override;
+ InstructionSelector *getInstructionSelector() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
};
} // End llvm namespace
diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp
index f4e6ed9f6284..5ffc6eda6bd7 100644
--- a/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -17,6 +17,10 @@
#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -30,6 +34,7 @@ extern "C" void LLVMInitializeRISCVTarget() {
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
auto PR = PassRegistry::getPassRegistry();
+ initializeGlobalISel(*PR);
initializeRISCVExpandPseudoPass(*PR);
}
@@ -58,7 +63,7 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(TT, RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
- TLOF(make_unique<RISCVELFTargetObjectFile>()),
+ TLOF(std::make_unique<RISCVELFTargetObjectFile>()),
Subtarget(TT, CPU, FS, Options.MCOptions.getABIName(), *this) {
initAsmInfo();
}
@@ -80,6 +85,10 @@ public:
void addIRPasses() override;
bool addInstSelector() override;
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreRegAlloc() override;
@@ -101,6 +110,26 @@ bool RISCVPassConfig::addInstSelector() {
return false;
}
+bool RISCVPassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+
+bool RISCVPassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
+ return false;
+}
+
+bool RISCVPassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
+ return false;
+}
+
+bool RISCVPassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
+ return false;
+}
+
void RISCVPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); }
void RISCVPassConfig::addPreEmitPass2() {
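With these hooks in place, the new GlobalISel skeleton can be exercised through llc's existing options, e.g. something like "llc -mtriple=riscv32 -global-isel -stop-after=irtranslator" to inspect the IRTranslator output; likewise, running with -verify-machineinstrs now also exercises the immediate-range checks added in RISCVInstrInfo::verifyInstruction.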
diff --git a/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/lib/Target/RISCV/Utils/RISCVBaseInfo.h
index c33c72f24319..30e475e80a01 100644
--- a/lib/Target/RISCV/Utils/RISCVBaseInfo.h
+++ b/lib/Target/RISCV/Utils/RISCVBaseInfo.h
@@ -16,6 +16,7 @@
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/SubtargetFeature.h"
namespace llvm {
@@ -63,6 +64,21 @@ enum {
};
} // namespace RISCVII
+namespace RISCVOp {
+enum OperandType : unsigned {
+ OPERAND_FIRST_RISCV_IMM = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_UIMM4 = OPERAND_FIRST_RISCV_IMM,
+ OPERAND_UIMM5,
+ OPERAND_UIMM12,
+ OPERAND_SIMM12,
+ OPERAND_SIMM13_LSB0,
+ OPERAND_UIMM20,
+ OPERAND_SIMM21_LSB0,
+ OPERAND_UIMMLOG2XLEN,
+ OPERAND_LAST_RISCV_IMM = OPERAND_UIMMLOG2XLEN
+};
+} // namespace RISCVOp
+
// Describes the predecessor/successor bits used in the FENCE instruction.
namespace RISCVFenceField {
enum FenceField {
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 15453ae59a4f..f6be9dd01249 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -376,7 +376,7 @@ public:
}
static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) {
- auto Op = make_unique<SparcOperand>(k_Token);
+ auto Op = std::make_unique<SparcOperand>(k_Token);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -386,7 +386,7 @@ public:
static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind,
SMLoc S, SMLoc E) {
- auto Op = make_unique<SparcOperand>(k_Register);
+ auto Op = std::make_unique<SparcOperand>(k_Register);
Op->Reg.RegNum = RegNum;
Op->Reg.Kind = (SparcOperand::RegisterKind)Kind;
Op->StartLoc = S;
@@ -396,7 +396,7 @@ public:
static std::unique_ptr<SparcOperand> CreateImm(const MCExpr *Val, SMLoc S,
SMLoc E) {
- auto Op = make_unique<SparcOperand>(k_Immediate);
+ auto Op = std::make_unique<SparcOperand>(k_Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -481,7 +481,7 @@ public:
static std::unique_ptr<SparcOperand>
CreateMEMr(unsigned Base, SMLoc S, SMLoc E) {
- auto Op = make_unique<SparcOperand>(k_MemoryReg);
+ auto Op = std::make_unique<SparcOperand>(k_MemoryReg);
Op->Mem.Base = Base;
Op->Mem.OffsetReg = Sparc::G0; // always 0
Op->Mem.Off = nullptr;
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index f1ca8e18c228..db8e7850300f 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -253,7 +253,7 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
if (!MO.isReg())
continue; // skip
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (MO.isDef()) {
// check whether Reg is defined or used before delay slot.
@@ -324,7 +324,7 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0)
continue;
if (MO.isDef())
@@ -380,7 +380,7 @@ static bool combineRestoreADD(MachineBasicBlock::iterator RestoreMI,
//
// After : restore <op0>, <op1>, %o[0-7]
- unsigned reg = AddMI->getOperand(0).getReg();
+ Register reg = AddMI->getOperand(0).getReg();
if (reg < SP::I0 || reg > SP::I7)
return false;
@@ -408,7 +408,7 @@ static bool combineRestoreOR(MachineBasicBlock::iterator RestoreMI,
//
// After : restore <op0>, <op1>, %o[0-7]
- unsigned reg = OrMI->getOperand(0).getReg();
+ Register reg = OrMI->getOperand(0).getReg();
if (reg < SP::I0 || reg > SP::I7)
return false;
@@ -446,7 +446,7 @@ static bool combineRestoreSETHIi(MachineBasicBlock::iterator RestoreMI,
//
// After : restore %g0, (imm3<<10), %o[0-7]
- unsigned reg = SetHiMI->getOperand(0).getReg();
+ Register reg = SetHiMI->getOperand(0).getReg();
if (reg < SP::I0 || reg > SP::I7)
return false;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index 88547075c5ae..c97a30e634cc 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -49,7 +49,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
}
if (IsPCRel) {
- switch((unsigned)Fixup.getKind()) {
+ switch(Fixup.getTargetKind()) {
default:
llvm_unreachable("Unimplemented fixup -> relocation");
case FK_Data_1: return ELF::R_SPARC_DISP8;
@@ -65,7 +65,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
}
}
- switch((unsigned)Fixup.getKind()) {
+ switch(Fixup.getTargetKind()) {
default:
llvm_unreachable("Unimplemented fixup -> relocation");
case FK_Data_1: return ELF::R_SPARC_8;
@@ -135,5 +135,5 @@ bool SparcELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
std::unique_ptr<MCObjectTargetWriter>
llvm::createSparcELFObjectWriter(bool Is64Bit, uint8_t OSABI) {
- return llvm::make_unique<SparcELFObjectWriter>(Is64Bit, OSABI);
+ return std::make_unique<SparcELFObjectWriter>(Is64Bit, OSABI);
}
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 1834a6fd861d..0f74f2bb344c 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -34,7 +34,8 @@ DisableLeafProc("disable-sparc-leaf-proc",
SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST)
: TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
- ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8) {}
+ ST.is64Bit() ? Align(16) : Align(8), 0,
+ ST.is64Bit() ? Align(16) : Align(8)) {}
void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index 8cff50d19ed4..4e61c341b703 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -231,7 +231,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
// Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
// the original GPRs.
- unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
+ Register GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
SDValue Chain = SDValue(N,0);
@@ -278,7 +278,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
// Copy REG_SEQ into a GPRPair-typed VR and replace the original two
// i32 VRs of inline asm with it.
- unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
+ Register GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index a6d440fa8aa2..4a2ba00ac6c2 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -417,7 +417,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32(
if (VA.needsCustom()) {
assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
- unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ Register VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi);
SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
@@ -445,7 +445,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32(
InVals.push_back(WholeValue);
continue;
}
- unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ Register VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(VA.getLocReg(), VReg);
SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
if (VA.getLocVT() == MVT::f32)
@@ -552,7 +552,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32(
std::vector<SDValue> OutChains;
for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
- unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+ Register VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32);
@@ -1016,9 +1016,9 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
- unsigned Reg = StringSwitch<unsigned>(RegName)
+Register SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const {
+ Register Reg = StringSwitch<unsigned>(RegName)
.Case("i0", SP::I0).Case("i1", SP::I1).Case("i2", SP::I2).Case("i3", SP::I3)
.Case("i4", SP::I4).Case("i5", SP::I5).Case("i6", SP::I6).Case("i7", SP::I7)
.Case("o0", SP::O0).Case("o1", SP::O1).Case("o2", SP::O2).Case("o3", SP::O3)
@@ -1438,7 +1438,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(Op, MVT::v2i32, Expand);
}
// Truncating/extending stores/loads are also not supported.
- for (MVT VT : MVT::integer_vector_valuetypes()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand);
@@ -1805,7 +1805,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- setMinFunctionAlignment(2);
+ setMinFunctionAlignment(Align(4));
computeRegisterProperties(Subtarget->getRegisterInfo());
}
@@ -2244,7 +2244,7 @@ SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_UL : {
- SDValue Mask = DAG.getTargetConstant(1, DL, Result.getValueType());
+ SDValue Mask = DAG.getConstant(1, DL, Result.getValueType());
Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType());
SPCC = SPCC::ICC_NE;
@@ -2277,14 +2277,14 @@ SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_LG : {
- SDValue Mask = DAG.getTargetConstant(3, DL, Result.getValueType());
+ SDValue Mask = DAG.getConstant(3, DL, Result.getValueType());
Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType());
SPCC = SPCC::ICC_NE;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_UE : {
- SDValue Mask = DAG.getTargetConstant(3, DL, Result.getValueType());
+ SDValue Mask = DAG.getConstant(3, DL, Result.getValueType());
Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType());
SPCC = SPCC::ICC_E;
@@ -2951,9 +2951,11 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
SDValue HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(isSigned);
SDValue MulResult = TLI.makeLibCall(DAG,
RTLIB::MUL_I128, WideVT,
- Args, isSigned, dl).first;
+ Args, CallOptions, dl).first;
SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
MulResult, DAG.getIntPtrConstant(0, dl));
SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
@@ -3183,7 +3185,7 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const {
case 'e':
return C_RegisterClass;
case 'I': // SIMM13
- return C_Other;
+ return C_Immediate;
}
}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 8d557a4225e5..3d798cec0c16 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -98,8 +98,8 @@ namespace llvm {
return MVT::i32;
}
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td
index 2d4f687f72d2..d18ab3b1370b 100644
--- a/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/lib/Target/Sparc/SparcInstr64Bit.td
@@ -177,7 +177,7 @@ def LEAX_ADDri : F3_2<2, 0b000000,
def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
-def : Pat<(ctpop i64:$src), (POPCrr $src)>;
+def : Pat<(i64 (ctpop i64:$src)), (POPCrr $src)>;
} // Predicates = [Is64Bit]
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index ad343fe6f80a..3d3d314a26bb 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -375,8 +375,8 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineInstr *MovMI = nullptr;
for (unsigned i = 0; i != numSubRegs; ++i) {
- unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
- unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]);
+ Register Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
+ Register Src = TRI->getSubReg(SrcReg, subRegIdx[i]);
assert(Dst && Src && "Bad sub-register");
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst);
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 8474c7abffb3..73dbdc4f443e 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -516,9 +516,9 @@ let DecoderMethod = "DecodeLoadQFP" in
defm LDQF : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>,
Requires<[HasV9, HasHardQuad]>;
-let DecoderMethod = "DecodeLoadCP" in
- defm LDC : Load<"ld", 0b110000, load, CoprocRegs, i32>;
-let DecoderMethod = "DecodeLoadCPPair" in
+let DecoderMethod = "DecodeLoadCP" in
+ defm LDC : Load<"ld", 0b110000, load, CoprocRegs, i32>;
+let DecoderMethod = "DecodeLoadCPPair" in
defm LDDC : Load<"ldd", 0b110011, load, CoprocPair, v2i32, IIC_ldd>;
let DecoderMethod = "DecodeLoadCP", Defs = [CPSR] in {
@@ -1508,7 +1508,7 @@ let rs1 = 0 in
def POPCrr : F3_1<2, 0b101110,
(outs IntRegs:$rd), (ins IntRegs:$rs2),
"popc $rs2, $rd", []>, Requires<[HasV9]>;
-def : Pat<(ctpop i32:$src),
+def : Pat<(i32 (ctpop i32:$src)),
(POPCrr (SRLri $src, 0))>;
let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index ce11a423d10e..19a90e98db7e 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -182,9 +182,9 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) {
if (MI.getOpcode() == SP::STQFri) {
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- unsigned SrcReg = MI.getOperand(2).getReg();
- unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64);
- unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64);
+ Register SrcReg = MI.getOperand(2).getReg();
+ Register SrcEvenReg = getSubReg(SrcReg, SP::sub_even64);
+ Register SrcOddReg = getSubReg(SrcReg, SP::sub_odd64);
MachineInstr *StMI =
BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
.addReg(FrameReg).addImm(0).addReg(SrcEvenReg);
@@ -194,9 +194,9 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
Offset += 8;
} else if (MI.getOpcode() == SP::LDQFri) {
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64);
- unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64);
+ Register DestReg = MI.getOperand(0).getReg();
+ Register DestEvenReg = getSubReg(DestReg, SP::sub_even64);
+ Register DestOddReg = getSubReg(DestReg, SP::sub_odd64);
MachineInstr *LdMI =
BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
.addReg(FrameReg).addImm(0);
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 195cff79de03..c1e3f8c36982 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -98,7 +98,7 @@ SparcTargetMachine::SparcTargetMachine(
getEffectiveSparcCodeModel(
CM, getEffectiveRelocModel(RM), is64bit, JIT),
OL),
- TLOF(make_unique<SparcELFTargetObjectFile>()),
+ TLOF(std::make_unique<SparcELFTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this, is64bit), is64Bit(is64bit) {
initAsmInfo();
}
@@ -133,7 +133,7 @@ SparcTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<SparcSubtarget>(TargetTriple, CPU, FS, *this,
+ I = std::make_unique<SparcSubtarget>(TargetTriple, CPU, FS, *this,
this->is64Bit);
}
return I.get();
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index a259ba3433d6..93c4ce4b5ccc 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -155,11 +155,11 @@ public:
// Create particular kinds of operand.
static std::unique_ptr<SystemZOperand> createInvalid(SMLoc StartLoc,
SMLoc EndLoc) {
- return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc);
+ return std::make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc);
}
static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) {
- auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc);
+ auto Op = std::make_unique<SystemZOperand>(KindToken, Loc, Loc);
Op->Token.Data = Str.data();
Op->Token.Length = Str.size();
return Op;
@@ -167,7 +167,7 @@ public:
static std::unique_ptr<SystemZOperand>
createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
- auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc);
+ auto Op = std::make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc);
Op->Reg.Kind = Kind;
Op->Reg.Num = Num;
return Op;
@@ -175,7 +175,7 @@ public:
static std::unique_ptr<SystemZOperand>
createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) {
- auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc);
+ auto Op = std::make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc);
Op->Imm = Expr;
return Op;
}
@@ -184,7 +184,7 @@ public:
createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base,
const MCExpr *Disp, unsigned Index, const MCExpr *LengthImm,
unsigned LengthReg, SMLoc StartLoc, SMLoc EndLoc) {
- auto Op = make_unique<SystemZOperand>(KindMem, StartLoc, EndLoc);
+ auto Op = std::make_unique<SystemZOperand>(KindMem, StartLoc, EndLoc);
Op->Mem.MemKind = MemKind;
Op->Mem.RegKind = RegKind;
Op->Mem.Base = Base;
@@ -200,7 +200,7 @@ public:
static std::unique_ptr<SystemZOperand>
createImmTLS(const MCExpr *Imm, const MCExpr *Sym,
SMLoc StartLoc, SMLoc EndLoc) {
- auto Op = make_unique<SystemZOperand>(KindImmTLS, StartLoc, EndLoc);
+ auto Op = std::make_unique<SystemZOperand>(KindImmTLS, StartLoc, EndLoc);
Op->ImmTLS.Imm = Imm;
Op->ImmTLS.Sym = Sym;
return Op;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 8d8ba5644e10..49b6fc490336 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -162,5 +162,5 @@ unsigned SystemZObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createSystemZObjectWriter(uint8_t OSABI) {
- return llvm::make_unique<SystemZObjectWriter>(OSABI);
+ return std::make_unique<SystemZObjectWriter>(OSABI);
}
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index 2b0f90182d7f..88cf589a3f10 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -190,7 +190,6 @@ static inline bool isImmHF(uint64_t Val) {
FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
-FunctionPass *createSystemZExpandPseudoPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index ef378e4ade7a..10023e9e169c 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -501,6 +501,10 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
break;
+ case TargetOpcode::FENTRY_CALL:
+ LowerFENTRY_CALL(*MI, Lower);
+ return;
+
case TargetOpcode::STACKMAP:
LowerSTACKMAP(*MI);
return;
@@ -546,6 +550,22 @@ static unsigned EmitNop(MCContext &OutContext, MCStreamer &OutStreamer,
}
}
+void SystemZAsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
+ SystemZMCInstLower &Lower) {
+ MCContext &Ctx = MF->getContext();
+ if (MF->getFunction().getFnAttribute("mnop-mcount")
+ .getValueAsString() == "true") {
+ EmitNop(Ctx, *OutStreamer, 6, getSubtargetInfo());
+ return;
+ }
+
+ MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
+ const MCSymbolRefExpr *Op =
+ MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_PLT, Ctx);
+ OutStreamer->EmitInstruction(MCInstBuilder(SystemZ::BRASL)
+ .addReg(SystemZ::R0D).addExpr(Op), getSubtargetInfo());
+}
+
void SystemZAsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(MF->getSubtarget().getInstrInfo());
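
The new LowerFENTRY_CALL keys off two string function attributes that also show up in the SystemZISelDAGToDAG.cpp hunk below: with "mnop-mcount"="true" it emits a 6-byte NOP instead of the profiling call, otherwise it emits brasl %r0, __fentry__@PLT. A standalone sketch of just that decision; the attribute names and emitted sequences come from the patch, everything else is a mock rather than the AsmPrinter API:

#include <cstdio>
#include <map>
#include <string>

// Mock of the lookup the patch performs via
// MF->getFunction().getFnAttribute("mnop-mcount").getValueAsString().
using FnAttrs = std::map<std::string, std::string>;

static std::string lowerFentryCall(const FnAttrs &Attrs) {
  auto It = Attrs.find("mnop-mcount");
  if (It != Attrs.end() && It->second == "true")
    return "6-byte NOP";                  // patchable no-op, no call emitted
  return "brasl %r0, __fentry__@PLT";     // call the profiling hook
}

int main() {
  std::printf("%s\n", lowerFentryCall({{"fentry-call", "true"}}).c_str());
  std::printf("%s\n",
              lowerFentryCall({{"fentry-call", "true"},
                               {"mnop-mcount", "true"}}).c_str());
  return 0;
}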
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h
index aa5d3ca78e61..d01a17c2ebe2 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -46,6 +46,7 @@ public:
}
private:
+ void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL);
void LowerSTACKMAP(const MachineInstr &MI);
void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower);
};
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index 9cbf6b320504..946eb2ba7c79 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -152,7 +152,7 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) {
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI.getOperand(I);
if (MO.isReg()) {
- if (unsigned MOReg = MO.getReg()) {
+ if (Register MOReg = MO.getReg()) {
if (TRI->regsOverlap(MOReg, Reg)) {
if (MO.isUse())
Ref.Use = true;
@@ -378,11 +378,8 @@ bool SystemZElimCompare::adjustCCMasksForInstr(
}
// CC is now live after MI.
- if (!ConvOpc) {
- int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
- assert(CCDef >= 0 && "Couldn't find CC set");
- MI.getOperand(CCDef).setIsDead(false);
- }
+ if (!ConvOpc)
+ MI.clearRegisterDeads(SystemZ::CC);
// Check if MI lies before Compare.
bool BeforeCmp = false;
diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp
deleted file mode 100644
index 09708fb4241c..000000000000
--- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//==-- SystemZExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a pass that expands pseudo instructions into target
-// instructions to allow proper scheduling and other late optimizations. This
-// pass should be run after register allocation but before the post-regalloc
-// scheduling pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "SystemZ.h"
-#include "SystemZInstrInfo.h"
-#include "SystemZSubtarget.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-using namespace llvm;
-
-#define SYSTEMZ_EXPAND_PSEUDO_NAME "SystemZ pseudo instruction expansion pass"
-
-namespace llvm {
- void initializeSystemZExpandPseudoPass(PassRegistry&);
-}
-
-namespace {
-class SystemZExpandPseudo : public MachineFunctionPass {
-public:
- static char ID;
- SystemZExpandPseudo() : MachineFunctionPass(ID) {
- initializeSystemZExpandPseudoPass(*PassRegistry::getPassRegistry());
- }
-
- const SystemZInstrInfo *TII;
-
- bool runOnMachineFunction(MachineFunction &Fn) override;
-
- StringRef getPassName() const override { return SYSTEMZ_EXPAND_PSEUDO_NAME; }
-
-private:
- bool expandMBB(MachineBasicBlock &MBB);
- bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
- bool expandLOCRMux(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
-};
-char SystemZExpandPseudo::ID = 0;
-}
-
-INITIALIZE_PASS(SystemZExpandPseudo, "systemz-expand-pseudo",
- SYSTEMZ_EXPAND_PSEUDO_NAME, false, false)
-
-/// Returns an instance of the pseudo instruction expansion pass.
-FunctionPass *llvm::createSystemZExpandPseudoPass(SystemZTargetMachine &TM) {
- return new SystemZExpandPseudo();
-}
-
-// MI is a load-register-on-condition pseudo instruction that could not be
-// handled as a single hardware instruction. Replace it by a branch sequence.
-bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
- MachineFunction &MF = *MBB.getParent();
- const BasicBlock *BB = MBB.getBasicBlock();
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(2).getReg();
- unsigned CCValid = MI.getOperand(3).getImm();
- unsigned CCMask = MI.getOperand(4).getImm();
-
- LivePhysRegs LiveRegs(TII->getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- for (auto I = std::prev(MBB.end()); I != MBBI; --I)
- LiveRegs.stepBackward(*I);
-
- // Splice MBB at MI, moving the rest of the block into RestMBB.
- MachineBasicBlock *RestMBB = MF.CreateMachineBasicBlock(BB);
- MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB);
- RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end());
- RestMBB->transferSuccessors(&MBB);
- for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
- RestMBB->addLiveIn(*I);
-
- // Create a new block MoveMBB to hold the move instruction.
- MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB);
- MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB);
- MoveMBB->addLiveIn(SrcReg);
- for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
- MoveMBB->addLiveIn(*I);
-
- // At the end of MBB, create a conditional branch to RestMBB if the
- // condition is false, otherwise fall through to MoveMBB.
- BuildMI(&MBB, DL, TII->get(SystemZ::BRC))
- .addImm(CCValid).addImm(CCMask ^ CCValid).addMBB(RestMBB);
- MBB.addSuccessor(RestMBB);
- MBB.addSuccessor(MoveMBB);
-
- // In MoveMBB, emit an instruction to move SrcReg into DestReg,
- // then fall through to RestMBB.
- TII->copyPhysReg(*MoveMBB, MoveMBB->end(), DL, DestReg, SrcReg,
- MI.getOperand(2).isKill());
- MoveMBB->addSuccessor(RestMBB);
-
- NextMBBI = MBB.end();
- MI.eraseFromParent();
- return true;
-}
-
-/// If MBBI references a pseudo instruction that should be expanded here,
-/// do the expansion and return true. Otherwise return false.
-bool SystemZExpandPseudo::expandMI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
- MachineInstr &MI = *MBBI;
- switch (MI.getOpcode()) {
- case SystemZ::LOCRMux:
- return expandLOCRMux(MBB, MBBI, NextMBBI);
- default:
- break;
- }
- return false;
-}
-
-/// Iterate over the instructions in basic block MBB and expand any
-/// pseudo instructions. Return true if anything was modified.
-bool SystemZExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
- bool Modified = false;
-
- MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
- while (MBBI != E) {
- MachineBasicBlock::iterator NMBBI = std::next(MBBI);
- Modified |= expandMI(MBB, MBBI, NMBBI);
- MBBI = NMBBI;
- }
-
- return Modified;
-}
-
-bool SystemZExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
-
- bool Modified = false;
- for (auto &MBB : MF)
- Modified |= expandMBB(MBB);
- return Modified;
-}
-
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index da28faebb326..0b8b6880accc 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -46,8 +46,8 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
} // end anonymous namespace
SystemZFrameLowering::SystemZFrameLowering()
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
- -SystemZMC::CallFrameSize, 8,
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8),
+ -SystemZMC::CallFrameSize, Align(8),
false /* StackRealignable */) {
// Create a mapping from register number to save slot offset.
RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
@@ -118,7 +118,7 @@ static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
unsigned GPR64, bool IsImplicit) {
const TargetRegisterInfo *RI =
MBB.getParent()->getSubtarget().getRegisterInfo();
- unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
+ Register GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
if (!IsLive || !IsImplicit) {
MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive));
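
SystemZFrameLowering now passes Align(8) where it used to pass a raw 8, and the same migration explains the setPrefFunctionAlignment change in SystemZISelLowering.cpp below: the old setter encoded alignments as log2 (so the 4 meant 16 bytes, matching the "16-byte alignment" comment), while llvm::Align is built from the byte count itself. A simplified stand-in for the type, not the real llvm/Support/Alignment.h:

#include <cassert>
#include <cstdint>

// Simplified Align-like wrapper: constructed from a byte count, rejects
// values that are not powers of two, and stores the log2 internally.
class AlignSketch {
  uint8_t ShiftValue = 0;
public:
  explicit AlignSketch(uint64_t Value) {
    assert(Value > 0 && (Value & (Value - 1)) == 0 && "not a power of 2");
    while ((uint64_t(1) << ShiftValue) != Value)
      ++ShiftValue;
  }
  uint64_t value() const { return uint64_t(1) << ShiftValue; }
};

int main() {
  AlignSketch StackAlign(8);     // what the Align(8) arguments above express
  assert(StackAlign.value() == 8);
  // AlignSketch Bad(12);        // would assert: 12 is not a power of two
  return 0;
}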
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 9dc4512255cc..751034c2d41a 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -346,6 +346,11 @@ public:
: SelectionDAGISel(TM, OptLevel) {}
bool runOnMachineFunction(MachineFunction &MF) override {
+ const Function &F = MF.getFunction();
+ if (F.getFnAttribute("mnop-mcount").getValueAsString() == "true" &&
+ F.getFnAttribute("fentry-call").getValueAsString() != "true")
+ report_fatal_error("mnop-mcount only supported with fentry-call");
+
Subtarget = &MF.getSubtarget<SystemZSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -1146,7 +1151,7 @@ void SystemZDAGToDAGISel::loadVectorConstant(
SDLoc DL(Node);
SmallVector<SDValue, 2> Ops;
for (unsigned OpVal : VCI.OpVals)
- Ops.push_back(CurDAG->getConstant(OpVal, DL, MVT::i32));
+ Ops.push_back(CurDAG->getTargetConstant(OpVal, DL, MVT::i32));
SDValue Op = CurDAG->getNode(VCI.Opcode, DL, VCI.VecVT, Ops);
if (VCI.VecVT == VT.getSimpleVT())
@@ -1550,8 +1555,8 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
uint64_t ConstCCMask =
cast<ConstantSDNode>(CCMask.getNode())->getZExtValue();
// Invert the condition.
- CCMask = CurDAG->getConstant(ConstCCValid ^ ConstCCMask, SDLoc(Node),
- CCMask.getValueType());
+ CCMask = CurDAG->getTargetConstant(ConstCCValid ^ ConstCCMask,
+ SDLoc(Node), CCMask.getValueType());
SDValue Op4 = Node->getOperand(4);
SDNode *UpdatedNode =
CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4);
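
Two unrelated changes land in this file: the immediates that must survive into the selected instruction (the vector-constant OpVals and the inverted CC mask) switch from getConstant to getTargetConstant, so they stay immediate operands instead of values the selector might materialize, and runOnMachineFunction now rejects "mnop-mcount" without "fentry-call". The second rule, reduced to a standalone truth table with names invented for the sketch:

#include <cassert>

// The constraint the new report_fatal_error check enforces: "mnop-mcount"
// replaces the __fentry__ calls with NOPs, so it only makes sense when
// "fentry-call" instrumentation is enabled in the first place.
static bool mcountAttrsValid(bool MNopMcount, bool FentryCall) {
  return !MNopMcount || FentryCall;
}

int main() {
  assert(mcountAttrsValid(false, false));
  assert(mcountAttrsValid(false, true));
  assert(mcountAttrsValid(true, true));
  assert(!mcountAttrsValid(true, false)); // the case the patch makes fatal
  return 0;
}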
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 78820f511ab4..e0ca9da93561 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -120,9 +120,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// Instructions are strings of 2-byte aligned 2-byte values.
- setMinFunctionAlignment(2);
+ setMinFunctionAlignment(Align(2));
// For performance reasons we prefer 16-byte alignment.
- setPrefFunctionAlignment(4);
+ setPrefFunctionAlignment(Align(16));
// Handle operations that are handled in a similar way for all types.
for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
@@ -206,6 +206,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// the default expansion.
if (!Subtarget.hasFPExtension())
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+
+ // Mirror those settings for STRICT_FP_TO_[SU]INT. Note that these all
+ // default to Expand, so need to be modified to Legal where appropriate.
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Legal);
+ if (Subtarget.hasFPExtension())
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Legal);
}
}
@@ -252,7 +258,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote);
setOperationAction(ISD::CTLZ, MVT::i64, Legal);
- // On arch13 we have native support for a 64-bit CTPOP.
+ // On z15 we have native support for a 64-bit CTPOP.
if (Subtarget.hasMiscellaneousExtensions3()) {
setOperationAction(ISD::CTPOP, MVT::i32, Promote);
setOperationAction(ISD::CTPOP, MVT::i64, Legal);
@@ -294,14 +300,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Handle prefetches with PFD or PFDRL.
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
// Assume by default that all vector operations need to be expanded.
for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
if (getOperationAction(Opcode, VT) == Legal)
setOperationAction(Opcode, VT, Expand);
// Likewise all truncating stores and extending loads.
- for (MVT InnerVT : MVT::vector_valuetypes()) {
+ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
@@ -327,7 +333,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
}
// Handle integer vector types.
- for (MVT VT : MVT::integer_vector_valuetypes()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
if (isTypeLegal(VT)) {
// These operations have direct equivalents.
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
@@ -381,6 +387,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f64, Legal);
}
if (Subtarget.hasVectorEnhancements2()) {
@@ -392,6 +403,11 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v4f32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f32, Legal);
}
// Handle floating-point types.
@@ -831,7 +847,7 @@ supportedAddressingMode(Instruction *I, bool HasVector) {
}
if (isa<LoadInst>(I) && I->hasOneUse()) {
- auto *SingleUser = dyn_cast<Instruction>(*I->user_begin());
+ auto *SingleUser = cast<Instruction>(*I->user_begin());
if (SingleUser->getParent() == I->getParent()) {
if (isa<ICmpInst>(SingleUser)) {
if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1)))
@@ -956,7 +972,7 @@ SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
case 'K': // Signed 16-bit constant
case 'L': // Signed 20-bit displacement (on all targets we support)
case 'M': // 0x7fffffff
- return C_Other;
+ return C_Immediate;
default:
break;
@@ -1335,7 +1351,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
break;
}
- unsigned VReg = MRI.createVirtualRegister(RC);
+ Register VReg = MRI.createVirtualRegister(RC);
MRI.addLiveIn(VA.getLocReg(), VReg);
ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
} else {
@@ -1430,7 +1446,7 @@ static bool canUseSiblingCall(const CCState &ArgCCInfo,
return false;
if (!VA.isRegLoc())
return false;
- unsigned Reg = VA.getLocReg();
+ Register Reg = VA.getLocReg();
if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
return false;
if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError())
@@ -1674,7 +1690,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetValue = convertValVTToLocVT(DAG, DL, VA, RetValue);
// Chain and glue the copies together.
- unsigned Reg = VA.getLocReg();
+ Register Reg = VA.getLocReg();
Chain = DAG.getCopyToReg(Chain, DL, Reg, RetValue, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(Reg, VA.getLocVT()));
@@ -2533,12 +2549,12 @@ static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
}
if (C.Opcode == SystemZISD::ICMP)
return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1,
- DAG.getConstant(C.ICmpType, DL, MVT::i32));
+ DAG.getTargetConstant(C.ICmpType, DL, MVT::i32));
if (C.Opcode == SystemZISD::TM) {
bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
- DAG.getConstant(RegisterOnly, DL, MVT::i32));
+ DAG.getTargetConstant(RegisterOnly, DL, MVT::i32));
}
return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1);
}
@@ -2576,10 +2592,10 @@ static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// in CCValid, so other values can be ignored.
static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg,
unsigned CCValid, unsigned CCMask) {
- SDValue Ops[] = { DAG.getConstant(1, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(CCValid, DL, MVT::i32),
- DAG.getConstant(CCMask, DL, MVT::i32), CCReg };
+ SDValue Ops[] = {DAG.getConstant(1, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i32),
+ DAG.getTargetConstant(CCValid, DL, MVT::i32),
+ DAG.getTargetConstant(CCMask, DL, MVT::i32), CCReg};
return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops);
}
@@ -2741,9 +2757,10 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
SDValue CCReg = emitCmp(DAG, DL, C);
- return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
- Op.getOperand(0), DAG.getConstant(C.CCValid, DL, MVT::i32),
- DAG.getConstant(C.CCMask, DL, MVT::i32), Dest, CCReg);
+ return DAG.getNode(
+ SystemZISD::BR_CCMASK, DL, Op.getValueType(), Op.getOperand(0),
+ DAG.getTargetConstant(C.CCValid, DL, MVT::i32),
+ DAG.getTargetConstant(C.CCMask, DL, MVT::i32), Dest, CCReg);
}
// Return true if Pos is CmpOp and Neg is the negative of CmpOp,
@@ -2794,8 +2811,9 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
}
SDValue CCReg = emitCmp(DAG, DL, C);
- SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, DL, MVT::i32),
- DAG.getConstant(C.CCMask, DL, MVT::i32), CCReg};
+ SDValue Ops[] = {TrueOp, FalseOp,
+ DAG.getTargetConstant(C.CCValid, DL, MVT::i32),
+ DAG.getTargetConstant(C.CCMask, DL, MVT::i32), CCReg};
return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(), Ops);
}
@@ -3882,11 +3900,8 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
auto *Node = cast<MemIntrinsicSDNode>(Op.getNode());
- SDValue Ops[] = {
- Op.getOperand(0),
- DAG.getConstant(Code, DL, MVT::i32),
- Op.getOperand(1)
- };
+ SDValue Ops[] = {Op.getOperand(0), DAG.getTargetConstant(Code, DL, MVT::i32),
+ Op.getOperand(1)};
return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL,
Node->getVTList(), Ops,
Node->getMemoryVT(), Node->getMemOperand());
@@ -4228,7 +4243,7 @@ static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);
SDValue Op;
if (P.Opcode == SystemZISD::PERMUTE_DWORDS) {
- SDValue Op2 = DAG.getConstant(P.Operand, DL, MVT::i32);
+ SDValue Op2 = DAG.getTargetConstant(P.Operand, DL, MVT::i32);
Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2);
} else if (P.Opcode == SystemZISD::PACK) {
MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8),
@@ -4253,7 +4268,8 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
unsigned StartIndex, OpNo0, OpNo1;
if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0],
- Ops[OpNo1], DAG.getConstant(StartIndex, DL, MVT::i32));
+ Ops[OpNo1],
+ DAG.getTargetConstant(StartIndex, DL, MVT::i32));
// Fall back on VPERM. Construct an SDNode for the permute vector.
SDValue IndexNodes[SystemZ::VectorBytes];
@@ -4751,7 +4767,7 @@ SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index));
// Otherwise keep it as a vector-to-vector operation.
return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0),
- DAG.getConstant(Index, DL, MVT::i32));
+ DAG.getTargetConstant(Index, DL, MVT::i32));
}
GeneralShuffle GS(VT);
@@ -6041,8 +6057,8 @@ SDValue SystemZTargetLowering::combineBR_CCMASK(
if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0),
Chain,
- DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32),
- DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32),
+ DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32),
+ DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32),
N->getOperand(3), CCReg);
return SDValue();
}
@@ -6063,10 +6079,9 @@ SDValue SystemZTargetLowering::combineSELECT_CCMASK(
if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0),
- N->getOperand(0),
- N->getOperand(1),
- DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32),
- DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32),
+ N->getOperand(0), N->getOperand(1),
+ DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32),
+ DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32),
CCReg);
return SDValue();
}
@@ -6548,19 +6563,17 @@ static bool isSelectPseudo(MachineInstr &MI) {
// Helper function, which inserts PHI functions into SinkMBB:
// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
-// where %FalseValue(i) and %TrueValue(i) are taken from the consequent Selects
-// in [MIItBegin, MIItEnd) range.
-static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
- MachineBasicBlock::iterator MIItEnd,
+// where %FalseValue(i) and %TrueValue(i) are taken from Selects.
+static void createPHIsForSelects(SmallVector<MachineInstr*, 8> &Selects,
MachineBasicBlock *TrueMBB,
MachineBasicBlock *FalseMBB,
MachineBasicBlock *SinkMBB) {
MachineFunction *MF = TrueMBB->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
- unsigned CCValid = MIItBegin->getOperand(3).getImm();
- unsigned CCMask = MIItBegin->getOperand(4).getImm();
- DebugLoc DL = MIItBegin->getDebugLoc();
+ MachineInstr *FirstMI = Selects.front();
+ unsigned CCValid = FirstMI->getOperand(3).getImm();
+ unsigned CCMask = FirstMI->getOperand(4).getImm();
MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
@@ -6572,16 +6585,15 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
// destination registers, and the registers that went into the PHI.
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
- for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;
- MIIt = skipDebugInstructionsForward(++MIIt, MIItEnd)) {
- unsigned DestReg = MIIt->getOperand(0).getReg();
- unsigned TrueReg = MIIt->getOperand(1).getReg();
- unsigned FalseReg = MIIt->getOperand(2).getReg();
+ for (auto MI : Selects) {
+ Register DestReg = MI->getOperand(0).getReg();
+ Register TrueReg = MI->getOperand(1).getReg();
+ Register FalseReg = MI->getOperand(2).getReg();
// If this Select we are generating is the opposite condition from
// the jump we generated, then we have to swap the operands for the
// PHI that is going to be generated.
- if (MIIt->getOperand(4).getImm() == (CCValid ^ CCMask))
+ if (MI->getOperand(4).getImm() == (CCValid ^ CCMask))
std::swap(TrueReg, FalseReg);
if (RegRewriteTable.find(TrueReg) != RegRewriteTable.end())
@@ -6590,6 +6602,7 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
if (RegRewriteTable.find(FalseReg) != RegRewriteTable.end())
FalseReg = RegRewriteTable[FalseReg].second;
+ DebugLoc DL = MI->getDebugLoc();
BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(SystemZ::PHI), DestReg)
.addReg(TrueReg).addMBB(TrueMBB)
.addReg(FalseReg).addMBB(FalseMBB);
@@ -6605,36 +6618,61 @@ static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
MachineBasicBlock *
SystemZTargetLowering::emitSelect(MachineInstr &MI,
MachineBasicBlock *MBB) const {
+ assert(isSelectPseudo(MI) && "Bad call to emitSelect()");
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
unsigned CCValid = MI.getOperand(3).getImm();
unsigned CCMask = MI.getOperand(4).getImm();
- DebugLoc DL = MI.getDebugLoc();
// If we have a sequence of Select* pseudo instructions using the
// same condition code value, we want to expand all of them into
// a single pair of basic blocks using the same condition.
- MachineInstr *LastMI = &MI;
- MachineBasicBlock::iterator NextMIIt = skipDebugInstructionsForward(
- std::next(MachineBasicBlock::iterator(MI)), MBB->end());
-
- if (isSelectPseudo(MI))
- while (NextMIIt != MBB->end() && isSelectPseudo(*NextMIIt) &&
- NextMIIt->getOperand(3).getImm() == CCValid &&
- (NextMIIt->getOperand(4).getImm() == CCMask ||
- NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask))) {
- LastMI = &*NextMIIt;
- NextMIIt = skipDebugInstructionsForward(++NextMIIt, MBB->end());
+ SmallVector<MachineInstr*, 8> Selects;
+ SmallVector<MachineInstr*, 8> DbgValues;
+ Selects.push_back(&MI);
+ unsigned Count = 0;
+ for (MachineBasicBlock::iterator NextMIIt =
+ std::next(MachineBasicBlock::iterator(MI));
+ NextMIIt != MBB->end(); ++NextMIIt) {
+ if (NextMIIt->definesRegister(SystemZ::CC))
+ break;
+ if (isSelectPseudo(*NextMIIt)) {
+ assert(NextMIIt->getOperand(3).getImm() == CCValid &&
+ "Bad CCValid operands since CC was not redefined.");
+ if (NextMIIt->getOperand(4).getImm() == CCMask ||
+ NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask)) {
+ Selects.push_back(&*NextMIIt);
+ continue;
+ }
+ break;
}
+ bool User = false;
+ for (auto SelMI : Selects)
+ if (NextMIIt->readsVirtualRegister(SelMI->getOperand(0).getReg())) {
+ User = true;
+ break;
+ }
+ if (NextMIIt->isDebugInstr()) {
+ if (User) {
+ assert(NextMIIt->isDebugValue() && "Unhandled debug opcode.");
+ DbgValues.push_back(&*NextMIIt);
+ }
+ }
+ else if (User || ++Count > 20)
+ break;
+ }
+ MachineInstr *LastMI = Selects.back();
+ bool CCKilled =
+ (LastMI->killsRegister(SystemZ::CC) || checkCCKill(*LastMI, MBB));
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *JoinMBB = splitBlockAfter(LastMI, MBB);
MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
// Unless CC was killed in the last Select instruction, mark it as
// live-in to both FalseMBB and JoinMBB.
- if (!LastMI->killsRegister(SystemZ::CC) && !checkCCKill(*LastMI, JoinMBB)) {
+ if (!CCKilled) {
FalseMBB->addLiveIn(SystemZ::CC);
JoinMBB->addLiveIn(SystemZ::CC);
}
@@ -6643,7 +6681,7 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
// BRC CCMask, JoinMBB
// # fallthrough to FalseMBB
MBB = StartMBB;
- BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ BuildMI(MBB, MI.getDebugLoc(), TII->get(SystemZ::BRC))
.addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
MBB->addSuccessor(JoinMBB);
MBB->addSuccessor(FalseMBB);
@@ -6657,12 +6695,14 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
// %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
// ...
MBB = JoinMBB;
- MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
- MachineBasicBlock::iterator MIItEnd = skipDebugInstructionsForward(
- std::next(MachineBasicBlock::iterator(LastMI)), MBB->end());
- createPHIsForSelects(MIItBegin, MIItEnd, StartMBB, FalseMBB, MBB);
+ createPHIsForSelects(Selects, StartMBB, FalseMBB, MBB);
+ for (auto SelMI : Selects)
+ SelMI->eraseFromParent();
+
+ MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI();
+ for (auto DbgMI : DbgValues)
+ MBB->splice(InsertPos, StartMBB, DbgMI);
- StartMBB->erase(MIItBegin, MIItEnd);
return JoinMBB;
}
@@ -6678,10 +6718,10 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
- unsigned SrcReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(0).getReg();
MachineOperand Base = MI.getOperand(1);
int64_t Disp = MI.getOperand(2).getImm();
- unsigned IndexReg = MI.getOperand(3).getReg();
+ Register IndexReg = MI.getOperand(3).getReg();
unsigned CCValid = MI.getOperand(4).getImm();
unsigned CCMask = MI.getOperand(5).getImm();
DebugLoc DL = MI.getDebugLoc();
@@ -6773,7 +6813,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
// Extract the operands. Base can be a register or a frame index.
// Src2 can be a register or immediate.
- unsigned Dest = MI.getOperand(0).getReg();
+ Register Dest = MI.getOperand(0).getReg();
MachineOperand Base = earlyUseOperand(MI.getOperand(1));
int64_t Disp = MI.getOperand(2).getImm();
MachineOperand Src2 = earlyUseOperand(MI.getOperand(3));
@@ -6833,7 +6873,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
.addReg(OldVal).addReg(BitShift).addImm(0);
if (Invert) {
// Perform the operation normally and then invert every bit of the field.
- unsigned Tmp = MRI.createVirtualRegister(RC);
+ Register Tmp = MRI.createVirtualRegister(RC);
BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2);
if (BitSize <= 32)
// XILF with the upper BitSize bits set.
@@ -6842,7 +6882,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
else {
// Use LCGR and add -1 to the result, which is more compact than
// an XILF, XILH pair.
- unsigned Tmp2 = MRI.createVirtualRegister(RC);
+ Register Tmp2 = MRI.createVirtualRegister(RC);
BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp);
BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal)
.addReg(Tmp2).addImm(-1);
@@ -6891,7 +6931,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
bool IsSubWord = (BitSize < 32);
// Extract the operands. Base can be a register or a frame index.
- unsigned Dest = MI.getOperand(0).getReg();
+ Register Dest = MI.getOperand(0).getReg();
MachineOperand Base = earlyUseOperand(MI.getOperand(1));
int64_t Disp = MI.getOperand(2).getImm();
Register Src2 = MI.getOperand(3).getReg();
@@ -7005,13 +7045,13 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
MachineRegisterInfo &MRI = MF.getRegInfo();
// Extract the operands. Base can be a register or a frame index.
- unsigned Dest = MI.getOperand(0).getReg();
+ Register Dest = MI.getOperand(0).getReg();
MachineOperand Base = earlyUseOperand(MI.getOperand(1));
int64_t Disp = MI.getOperand(2).getImm();
- unsigned OrigCmpVal = MI.getOperand(3).getReg();
- unsigned OrigSwapVal = MI.getOperand(4).getReg();
- unsigned BitShift = MI.getOperand(5).getReg();
- unsigned NegBitShift = MI.getOperand(6).getReg();
+ Register OrigCmpVal = MI.getOperand(3).getReg();
+ Register OrigSwapVal = MI.getOperand(4).getReg();
+ Register BitShift = MI.getOperand(5).getReg();
+ Register NegBitShift = MI.getOperand(6).getReg();
int64_t BitSize = MI.getOperand(7).getImm();
DebugLoc DL = MI.getDebugLoc();
@@ -7023,14 +7063,14 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
assert(LOpcode && CSOpcode && "Displacement out of range");
// Create virtual registers for temporary results.
- unsigned OrigOldVal = MRI.createVirtualRegister(RC);
- unsigned OldVal = MRI.createVirtualRegister(RC);
- unsigned CmpVal = MRI.createVirtualRegister(RC);
- unsigned SwapVal = MRI.createVirtualRegister(RC);
- unsigned StoreVal = MRI.createVirtualRegister(RC);
- unsigned RetryOldVal = MRI.createVirtualRegister(RC);
- unsigned RetryCmpVal = MRI.createVirtualRegister(RC);
- unsigned RetrySwapVal = MRI.createVirtualRegister(RC);
+ Register OrigOldVal = MRI.createVirtualRegister(RC);
+ Register OldVal = MRI.createVirtualRegister(RC);
+ Register CmpVal = MRI.createVirtualRegister(RC);
+ Register SwapVal = MRI.createVirtualRegister(RC);
+ Register StoreVal = MRI.createVirtualRegister(RC);
+ Register RetryOldVal = MRI.createVirtualRegister(RC);
+ Register RetryCmpVal = MRI.createVirtualRegister(RC);
+ Register RetrySwapVal = MRI.createVirtualRegister(RC);
// Insert 2 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
@@ -7129,11 +7169,11 @@ SystemZTargetLowering::emitPair128(MachineInstr &MI,
MachineRegisterInfo &MRI = MF.getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Dest = MI.getOperand(0).getReg();
- unsigned Hi = MI.getOperand(1).getReg();
- unsigned Lo = MI.getOperand(2).getReg();
- unsigned Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
- unsigned Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+ Register Dest = MI.getOperand(0).getReg();
+ Register Hi = MI.getOperand(1).getReg();
+ Register Lo = MI.getOperand(2).getReg();
+ Register Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+ Register Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Tmp1);
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Tmp2)
@@ -7157,14 +7197,14 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
MachineRegisterInfo &MRI = MF.getRegInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned Dest = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
- unsigned In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+ Register Dest = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ Register In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128);
if (ClearEven) {
- unsigned NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
- unsigned Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+ Register NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+ Register Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
.addImm(0);
@@ -7308,7 +7348,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
// The previous iteration might have created out-of-range displacements.
// Apply them using LAY if so.
if (!isUInt<12>(DestDisp)) {
- unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
.add(DestBase)
.addImm(DestDisp)
@@ -7317,7 +7357,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
DestDisp = 0;
}
if (!isUInt<12>(SrcDisp)) {
- unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
.add(SrcBase)
.addImm(SrcDisp)
@@ -7474,11 +7514,11 @@ MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
DebugLoc DL = MI.getDebugLoc();
- unsigned SrcReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(0).getReg();
// Create new virtual register of the same class as source.
const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
- unsigned DstReg = MRI->createVirtualRegister(RC);
+ Register DstReg = MRI->createVirtualRegister(RC);
// Replace pseudo with a normal load-and-test that models the def as
// well.
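
The emitSelect rewrite above replaces the iterator-range walk with an explicit Selects list: it keeps absorbing Select* pseudos that use the same CCValid and an equal or inverted CCMask, stops as soon as something redefines CC or a non-debug instruction reads one of the pending results (or after scanning 20 unrelated instructions), and remembers DBG_VALUEs of those results so they can be re-inserted after the new PHIs in the join block. A standalone sketch of that grouping loop over mock instructions; the field names are invented for the sketch:

#include <cstdio>
#include <string>
#include <vector>

// Mock instruction: only the facts the grouping loop in emitSelect cares about.
struct MockMI {
  std::string Name;
  bool IsSelect = false;           // Select* pseudo using the shared CC
  bool SameOrInvMask = false;      // CCMask equal to, or the inverse of, the first
  bool DefinesCC = false;
  bool ReadsPendingResult = false; // reads a register defined by a pending Select
  bool IsDebugValue = false;
};

// Returns the indices of the Selects that can share one branch/PHI expansion,
// mirroring the loop that builds the Selects and DbgValues vectors above.
static std::vector<size_t> groupSelects(const std::vector<MockMI> &MIs) {
  std::vector<size_t> Selects{0};  // MIs[0] is the Select we started from
  unsigned Unrelated = 0;
  for (size_t I = 1; I < MIs.size(); ++I) {
    const MockMI &MI = MIs[I];
    if (MI.DefinesCC)
      break;
    if (MI.IsSelect) {
      if (!MI.SameOrInvMask)
        break;
      Selects.push_back(I);
      continue;
    }
    if (MI.IsDebugValue)
      continue;                    // DBG_VALUEs are recorded, not a barrier
    if (MI.ReadsPendingResult || ++Unrelated > 20)
      break;                       // a user (or too much scanning) ends the group
  }
  return Selects;
}

int main() {
  std::vector<MockMI> MIs = {
      {"SelectA", true, true},
      {"SelectB", true, true},
      {"add", false, false, false, false},       // unrelated, keeps scanning
      {"SelectC", true, true},
      {"cmp", false, false, /*DefinesCC=*/true}, // CC redefined: stop here
      {"SelectD", true, true}};
  for (size_t I : groupSelects(MIs))
    std::printf("grouped: %s\n", MIs[I].Name.c_str());
  return 0;
}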
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index 19c7ec58ed3d..9c95e8aec940 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -25,10 +25,10 @@ let Predicates = [FeatureNoVectorEnhancements1] in
let Predicates = [FeatureVectorEnhancements1] in
def SelectVR128 : SelectWrapper<f128, VR128>;
-defm CondStoreF32 : CondStores<FP32, nonvolatile_store,
- nonvolatile_load, bdxaddr20only>;
-defm CondStoreF64 : CondStores<FP64, nonvolatile_store,
- nonvolatile_load, bdxaddr20only>;
+defm CondStoreF32 : CondStores<FP32, simple_store,
+ simple_load, bdxaddr20only>;
+defm CondStoreF64 : CondStores<FP64, simple_store,
+ simple_load, bdxaddr20only>;
//===----------------------------------------------------------------------===//
// Move instructions
@@ -276,13 +276,13 @@ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in {
}
// fp_to_sint always rounds towards zero, which is modifier value 5.
-def : Pat<(i32 (fp_to_sint FP32:$src)), (CFEBR 5, FP32:$src)>;
-def : Pat<(i32 (fp_to_sint FP64:$src)), (CFDBR 5, FP64:$src)>;
-def : Pat<(i32 (fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>;
+def : Pat<(i32 (any_fp_to_sint FP32:$src)), (CFEBR 5, FP32:$src)>;
+def : Pat<(i32 (any_fp_to_sint FP64:$src)), (CFDBR 5, FP64:$src)>;
+def : Pat<(i32 (any_fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>;
-def : Pat<(i64 (fp_to_sint FP32:$src)), (CGEBR 5, FP32:$src)>;
-def : Pat<(i64 (fp_to_sint FP64:$src)), (CGDBR 5, FP64:$src)>;
-def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>;
+def : Pat<(i64 (any_fp_to_sint FP32:$src)), (CGEBR 5, FP32:$src)>;
+def : Pat<(i64 (any_fp_to_sint FP64:$src)), (CGDBR 5, FP64:$src)>;
+def : Pat<(i64 (any_fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>;
// The FP extension feature provides versions of the above that allow
// also specifying the inexact-exception suppression flag.
@@ -309,13 +309,13 @@ let Predicates = [FeatureFPExtension] in {
def CLGXBR : TernaryRRFe<"clgxbr", 0xB3AE, GR64, FP128>;
}
- def : Pat<(i32 (fp_to_uint FP32:$src)), (CLFEBR 5, FP32:$src, 0)>;
- def : Pat<(i32 (fp_to_uint FP64:$src)), (CLFDBR 5, FP64:$src, 0)>;
- def : Pat<(i32 (fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>;
+ def : Pat<(i32 (any_fp_to_uint FP32:$src)), (CLFEBR 5, FP32:$src, 0)>;
+ def : Pat<(i32 (any_fp_to_uint FP64:$src)), (CLFDBR 5, FP64:$src, 0)>;
+ def : Pat<(i32 (any_fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>;
- def : Pat<(i64 (fp_to_uint FP32:$src)), (CLGEBR 5, FP32:$src, 0)>;
- def : Pat<(i64 (fp_to_uint FP64:$src)), (CLGDBR 5, FP64:$src, 0)>;
- def : Pat<(i64 (fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>;
+ def : Pat<(i64 (any_fp_to_uint FP32:$src)), (CLGEBR 5, FP32:$src, 0)>;
+ def : Pat<(i64 (any_fp_to_uint FP64:$src)), (CLGDBR 5, FP64:$src, 0)>;
+ def : Pat<(i64 (any_fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>;
}
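
The fp_to_sint/fp_to_uint patterns now use the any_ forms, which, together with the STRICT_FP_TO_[SU]INT actions added in SystemZISelLowering.cpp above, lets the same CFEBR/CLGDBR-family instructions match both the ordinary and the strict conversion nodes. Strict conversions are what a front end emits when the FP environment is observable; a small standalone C++ illustration of source code that is meant to take that path (whether a given compiler version honors the pragma is a separate question from this patch):

#include <cfenv>
#include <cstdio>

// When FP exception state is observable, a conversion like the one below is
// expected to come through the constrained/STRICT_FP_TO_SINT path instead of
// the ordinary fp_to_sint node, which is why the patterns above now use
// any_fp_to_sint and any_fp_to_uint (matching either form).
#pragma STDC FENV_ACCESS ON

int main() {
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile double D = 3.75;
  int I = static_cast<int>(D);          // may raise FE_INEXACT
  std::printf("%d inexact=%d\n", I, std::fetestexcept(FE_INEXACT) != 0);
  return 0;
}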
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 2a1d14de3ddf..c9dbe3da686d 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2141,17 +2141,17 @@ class FixedCondBranchRXY<CondVariant V, string mnemonic, bits<16> opcode,
}
class CmpBranchRIEa<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, cond4:$M3),
mnemonic#"$M3\t$R1, $I2", []>;
class AsmCmpBranchRIEa<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, imm32zx4:$M3),
mnemonic#"\t$R1, $I2, $M3", []>;
class FixedCmpBranchRIEa<CondVariant V, string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2),
mnemonic#V.suffix#"\t$R1, $I2", []> {
let isAsmParserOnly = V.alternate;
@@ -2159,7 +2159,7 @@ class FixedCmpBranchRIEa<CondVariant V, string mnemonic, bits<16> opcode,
}
multiclass CmpBranchRIEaPair<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm> {
+ RegisterOperand cls, ImmOpWithPattern imm> {
let isCodeGenOnly = 1 in
def "" : CmpBranchRIEa<mnemonic, opcode, cls, imm>;
def Asm : AsmCmpBranchRIEa<mnemonic, opcode, cls, imm>;
@@ -2193,19 +2193,19 @@ multiclass CmpBranchRIEbPair<string mnemonic, bits<16> opcode,
}
class CmpBranchRIEc<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIEc<opcode, (outs),
(ins cls:$R1, imm:$I2, cond4:$M3, brtarget16:$RI4),
mnemonic#"$M3\t$R1, $I2, $RI4", []>;
class AsmCmpBranchRIEc<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIEc<opcode, (outs),
(ins cls:$R1, imm:$I2, imm32zx4:$M3, brtarget16:$RI4),
mnemonic#"\t$R1, $I2, $M3, $RI4", []>;
class FixedCmpBranchRIEc<CondVariant V, string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIEc<opcode, (outs), (ins cls:$R1, imm:$I2, brtarget16:$RI4),
mnemonic#V.suffix#"\t$R1, $I2, $RI4", []> {
let isAsmParserOnly = V.alternate;
@@ -2213,7 +2213,7 @@ class FixedCmpBranchRIEc<CondVariant V, string mnemonic, bits<16> opcode,
}
multiclass CmpBranchRIEcPair<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm> {
+ RegisterOperand cls, ImmOpWithPattern imm> {
let isCodeGenOnly = 1 in
def "" : CmpBranchRIEc<mnemonic, opcode, cls, imm>;
def Asm : AsmCmpBranchRIEc<mnemonic, opcode, cls, imm>;
@@ -2272,19 +2272,19 @@ multiclass CmpBranchRRSPair<string mnemonic, bits<16> opcode,
}
class CmpBranchRIS<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIS<opcode, (outs),
(ins cls:$R1, imm:$I2, cond4:$M3, bdaddr12only:$BD4),
mnemonic#"$M3\t$R1, $I2, $BD4", []>;
class AsmCmpBranchRIS<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIS<opcode, (outs),
(ins cls:$R1, imm:$I2, imm32zx4:$M3, bdaddr12only:$BD4),
mnemonic#"\t$R1, $I2, $M3, $BD4", []>;
class FixedCmpBranchRIS<CondVariant V, string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIS<opcode, (outs), (ins cls:$R1, imm:$I2, bdaddr12only:$BD4),
mnemonic#V.suffix#"\t$R1, $I2, $BD4", []> {
let isAsmParserOnly = V.alternate;
@@ -2292,7 +2292,7 @@ class FixedCmpBranchRIS<CondVariant V, string mnemonic, bits<16> opcode,
}
multiclass CmpBranchRISPair<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm> {
+ RegisterOperand cls, ImmOpWithPattern imm> {
let isCodeGenOnly = 1 in
def "" : CmpBranchRIS<mnemonic, opcode, cls, imm>;
def Asm : AsmCmpBranchRIS<mnemonic, opcode, cls, imm>;
@@ -2585,7 +2585,7 @@ multiclass StoreMultipleVRSaAlign<string mnemonic, bits<16> opcode> {
// We therefore match the address in the same way as a normal store and
// only use the StoreSI* instruction if the matched address is suitable.
class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
- Immediate imm>
+ ImmOpWithPattern imm>
: InstSI<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
[(operator imm:$I2, mviaddr12pair:$BD1)]> {
@@ -2593,7 +2593,7 @@ class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
}
class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- Immediate imm>
+ ImmOpWithPattern imm>
: InstSIY<opcode, (outs), (ins mviaddr20pair:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
[(operator imm:$I2, mviaddr20pair:$BD1)]> {
@@ -2601,7 +2601,7 @@ class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
}
class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- Immediate imm>
+ ImmOpWithPattern imm>
: InstSIL<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
[(operator imm:$I2, mviaddr12pair:$BD1)]> {
@@ -2609,7 +2609,7 @@ class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
}
multiclass StoreSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
- SDPatternOperator operator, Immediate imm> {
+ SDPatternOperator operator, ImmOpWithPattern imm> {
let DispKey = mnemonic in {
let DispSize = "12" in
def "" : StoreSI<mnemonic, siOpcode, operator, imm>;
@@ -2665,7 +2665,7 @@ multiclass CondStoreRSYPair<string mnemonic, bits<16> opcode,
def Asm : AsmCondStoreRSY<mnemonic, opcode, cls, bytes, mode>;
}
-class SideEffectUnaryI<string mnemonic, bits<8> opcode, Immediate imm>
+class SideEffectUnaryI<string mnemonic, bits<8> opcode, ImmOpWithPattern imm>
: InstI<opcode, (outs), (ins imm:$I1),
mnemonic#"\t$I1", []>;
@@ -2761,13 +2761,13 @@ class UnaryMemRRFc<string mnemonic, bits<16> opcode,
}
class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIa<opcode, (outs cls:$R1), (ins imm:$I2),
mnemonic#"\t$R1, $I2",
[(set cls:$R1, (operator imm:$I2))]>;
class UnaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRILa<opcode, (outs cls:$R1), (ins imm:$I2),
mnemonic#"\t$R1, $I2",
[(set cls:$R1, (operator imm:$I2))]>;
@@ -2885,14 +2885,14 @@ multiclass UnaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
}
class UnaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- TypedReg tr, Immediate imm, bits<4> type = 0>
+ TypedReg tr, ImmOpWithPattern imm, bits<4> type = 0>
: InstVRIa<opcode, (outs tr.op:$V1), (ins imm:$I2),
mnemonic#"\t$V1, $I2",
- [(set (tr.vt tr.op:$V1), (operator imm:$I2))]> {
+ [(set (tr.vt tr.op:$V1), (operator (i32 timm:$I2)))]> {
let M3 = type;
}
-class UnaryVRIaGeneric<string mnemonic, bits<16> opcode, Immediate imm>
+class UnaryVRIaGeneric<string mnemonic, bits<16> opcode, ImmOpWithPattern imm>
: InstVRIa<opcode, (outs VR128:$V1), (ins imm:$I2, imm32zx4:$M3),
mnemonic#"\t$V1, $I2, $M3", []>;
@@ -3021,7 +3021,7 @@ class SideEffectBinaryRRFc<string mnemonic, bits<16> opcode,
}
class SideEffectBinaryIE<string mnemonic, bits<16> opcode,
- Immediate imm1, Immediate imm2>
+ ImmOpWithPattern imm1, ImmOpWithPattern imm2>
: InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2),
mnemonic#"\t$I1, $I2", []>;
@@ -3030,7 +3030,7 @@ class SideEffectBinarySI<string mnemonic, bits<8> opcode, Operand imm>
mnemonic#"\t$BD1, $I2", []>;
class SideEffectBinarySIL<string mnemonic, bits<16> opcode,
- SDPatternOperator operator, Immediate imm>
+ SDPatternOperator operator, ImmOpWithPattern imm>
: InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2", [(operator bdaddr12only:$BD1, imm:$I2)]>;
@@ -3165,7 +3165,7 @@ class BinaryRRFc<string mnemonic, bits<16> opcode,
mnemonic#"\t$R1, $R2, $M3", []>;
class BinaryMemRRFc<string mnemonic, bits<16> opcode,
- RegisterOperand cls1, RegisterOperand cls2, Immediate imm>
+ RegisterOperand cls1, RegisterOperand cls2, ImmOpWithPattern imm>
: InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []> {
let Constraints = "$R1 = $R1src";
@@ -3267,7 +3267,7 @@ multiclass CondBinaryRRFaPair<string mnemonic, bits<16> opcode,
}
class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIa<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
mnemonic#"\t$R1, $I2",
[(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
@@ -3276,14 +3276,14 @@ class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
}
class BinaryRIE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIEd<opcode, (outs cls:$R1), (ins cls:$R3, imm:$I2),
mnemonic#"\t$R1, $R3, $I2",
[(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2,
SDPatternOperator operator, RegisterOperand cls,
- Immediate imm> {
+ ImmOpWithPattern imm> {
let NumOpsKey = mnemonic in {
let NumOpsValue = "3" in
def K : BinaryRIE<mnemonic##"k", opcode2, operator, cls, imm>,
@@ -3294,7 +3294,7 @@ multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2,
}
class CondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
- Immediate imm>
+ ImmOpWithPattern imm>
: InstRIEg<opcode, (outs cls:$R1),
(ins cls:$R1src, imm:$I2, cond4:$valid, cond4:$M3),
mnemonic#"$M3\t$R1, $I2",
@@ -3308,7 +3308,7 @@ class CondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
// Like CondBinaryRIE, but used for the raw assembly form. The condition-code
// mask is the third operand rather than being part of the mnemonic.
class AsmCondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
- Immediate imm>
+ ImmOpWithPattern imm>
: InstRIEg<opcode, (outs cls:$R1),
(ins cls:$R1src, imm:$I2, imm32zx4:$M3),
mnemonic#"\t$R1, $I2, $M3", []> {
@@ -3318,7 +3318,7 @@ class AsmCondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
// Like CondBinaryRIE, but with a fixed CC mask.
class FixedCondBinaryRIE<CondVariant V, string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIEg<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
mnemonic#V.suffix#"\t$R1, $I2", []> {
let Constraints = "$R1 = $R1src";
@@ -3328,14 +3328,14 @@ class FixedCondBinaryRIE<CondVariant V, string mnemonic, bits<16> opcode,
}
multiclass CondBinaryRIEPair<string mnemonic, bits<16> opcode,
- RegisterOperand cls, Immediate imm> {
+ RegisterOperand cls, ImmOpWithPattern imm> {
let isCodeGenOnly = 1 in
def "" : CondBinaryRIE<mnemonic, opcode, cls, imm>;
def Asm : AsmCondBinaryRIE<mnemonic, opcode, cls, imm>;
}
class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRILa<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
mnemonic#"\t$R1, $I2",
[(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
@@ -3484,7 +3484,7 @@ class BinaryVRIb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<4> type>
: InstVRIb<opcode, (outs tr.op:$V1), (ins imm32zx8:$I2, imm32zx8:$I3),
mnemonic#"\t$V1, $I2, $I3",
- [(set (tr.vt tr.op:$V1), (operator imm32zx8:$I2, imm32zx8:$I3))]> {
+ [(set (tr.vt tr.op:$V1), (operator imm32zx8_timm:$I2, imm32zx8_timm:$I3))]> {
let M4 = type;
}
@@ -3498,7 +3498,7 @@ class BinaryVRIc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRIc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, imm32zx16:$I2),
mnemonic#"\t$V1, $V3, $I2",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V3),
- imm32zx16:$I2))]> {
+ imm32zx16_timm:$I2))]> {
let M4 = type;
}
@@ -3512,7 +3512,7 @@ class BinaryVRIe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRIe<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx12:$I3),
mnemonic#"\t$V1, $V2, $I3",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
- imm32zx12:$I3))]> {
+ imm32zx12_timm:$I3))]> {
let M4 = type;
let M5 = m5;
}
@@ -3715,7 +3715,7 @@ class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
: InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
mnemonic#"\t$V1, $XBD2, $M3",
[(set (tr.vt tr.op:$V1), (operator bdxaddr12only:$XBD2,
- imm32zx4:$M3))]> {
+ imm32zx4_timm:$M3))]> {
let mayLoad = 1;
let AccessBytes = bytes;
}
@@ -3765,7 +3765,7 @@ class BinaryVSI<string mnemonic, bits<16> opcode, SDPatternOperator operator,
}
class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
- Immediate index>
+ ImmOpWithPattern index>
: InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3),
mnemonic#"\t$V1, $VBD2, $M3", []> {
let mayStore = 1;
@@ -3774,7 +3774,7 @@ class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
class StoreBinaryVRX<string mnemonic, bits<16> opcode,
SDPatternOperator operator, TypedReg tr, bits<5> bytes,
- Immediate index>
+ ImmOpWithPattern index>
: InstVRX<opcode, (outs), (ins tr.op:$V1, bdxaddr12only:$XBD2, index:$M3),
mnemonic#"\t$V1, $XBD2, $M3",
[(operator (tr.vt tr.op:$V1), bdxaddr12only:$XBD2, index:$M3)]> {
@@ -3809,7 +3809,7 @@ class CompareRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
}
class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRIa<opcode, (outs), (ins cls:$R1, imm:$I2),
mnemonic#"\t$R1, $I2",
[(set CC, (operator cls:$R1, imm:$I2))]> {
@@ -3817,7 +3817,7 @@ class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
}
class CompareRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
- RegisterOperand cls, Immediate imm>
+ RegisterOperand cls, ImmOpWithPattern imm>
: InstRILa<opcode, (outs), (ins cls:$R1, imm:$I2),
mnemonic#"\t$R1, $I2",
[(set CC, (operator cls:$R1, imm:$I2))]> {
@@ -3924,7 +3924,7 @@ class CompareSSb<string mnemonic, bits<8> opcode>
}
class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
- SDPatternOperator load, Immediate imm,
+ SDPatternOperator load, ImmOpWithPattern imm,
AddressingMode mode = bdaddr12only>
: InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
@@ -3934,7 +3934,7 @@ class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
}
class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- SDPatternOperator load, Immediate imm>
+ SDPatternOperator load, ImmOpWithPattern imm>
: InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
[(set CC, (operator (load bdaddr12only:$BD1), imm:$I2))]> {
@@ -3943,7 +3943,7 @@ class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
}
class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- SDPatternOperator load, Immediate imm,
+ SDPatternOperator load, ImmOpWithPattern imm,
AddressingMode mode = bdaddr20only>
: InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
@@ -3954,7 +3954,7 @@ class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
SDPatternOperator operator, SDPatternOperator load,
- Immediate imm> {
+ ImmOpWithPattern imm> {
let DispKey = mnemonic in {
let DispSize = "12" in
def "" : CompareSI<mnemonic, siOpcode, operator, load, imm, bdaddr12pair>;
@@ -4012,7 +4012,7 @@ class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
}
class TestBinarySIL<string mnemonic, bits<16> opcode,
- SDPatternOperator operator, Immediate imm>
+ SDPatternOperator operator, ImmOpWithPattern imm>
: InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
[(set CC, (operator bdaddr12only:$BD1, imm:$I2))]>;
@@ -4073,7 +4073,7 @@ class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode,
class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2,
- Immediate imm>
+ ImmOpWithPattern imm>
: InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []>;
@@ -4086,7 +4086,7 @@ multiclass SideEffectTernaryRRFcOpt<string mnemonic, bits<16> opcode,
class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2,
- Immediate imm>
+ ImmOpWithPattern imm>
: InstRRFc<opcode, (outs cls1:$R1, cls2:$R2),
(ins cls1:$R1src, cls2:$R2src, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []> {
@@ -4221,7 +4221,7 @@ class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
}
class TernaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- TypedReg tr1, TypedReg tr2, Immediate imm, Immediate index>
+ TypedReg tr1, TypedReg tr2, ImmOpWithPattern imm, ImmOpWithPattern index>
: InstVRIa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V1src, imm:$I2, index:$M3),
mnemonic#"\t$V1, $I2, $M3",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
@@ -4237,7 +4237,7 @@ class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
mnemonic#"\t$V1, $V2, $V3, $I4",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
(tr2.vt tr2.op:$V3),
- imm32zx8:$I4))]> {
+ imm32zx8_timm:$I4))]> {
let M5 = type;
}
@@ -4252,8 +4252,8 @@ class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
(ins tr2.op:$V2, imm32zx4:$M4, imm32zx4:$M5),
mnemonic#"\t$V1, $V2, $M4, $M5",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
- imm32zx4:$M4,
- imm32zx4:$M5))],
+ imm32zx4_timm:$M4,
+ imm32zx4_timm:$M5))],
m4or> {
let M3 = type;
}
@@ -4285,13 +4285,13 @@ multiclass TernaryOptVRRbSPair<string mnemonic, bits<16> opcode,
TypedReg tr1, TypedReg tr2, bits<4> type,
bits<4> modifier = 0> {
def "" : TernaryVRRb<mnemonic, opcode, operator, tr1, tr2, type,
- imm32zx4even, !and (modifier, 14)>;
+ imm32zx4even_timm, !and (modifier, 14)>;
def : InstAlias<mnemonic#"\t$V1, $V2, $V3",
(!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
tr2.op:$V3, 0)>;
let Defs = [CC] in
def S : TernaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
- imm32zx4even, !add(!and (modifier, 14), 1)>;
+ imm32zx4even_timm, !add(!and (modifier, 14), 1)>;
def : InstAlias<mnemonic#"s\t$V1, $V2, $V3",
(!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2,
tr2.op:$V3, 0)>;
@@ -4314,7 +4314,7 @@ class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
mnemonic#"\t$V1, $V2, $V3, $M4",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
(tr2.vt tr2.op:$V3),
- imm32zx4:$M4))]> {
+ imm32zx4_timm:$M4))]> {
let M5 = 0;
let M6 = 0;
}
@@ -4327,7 +4327,7 @@ class TernaryVRRcFloat<string mnemonic, bits<16> opcode,
mnemonic#"\t$V1, $V2, $V3, $M6",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
(tr2.vt tr2.op:$V3),
- imm32zx4:$M6))]> {
+ imm32zx4_timm:$M6))]> {
let M4 = type;
let M5 = m5;
}
@@ -4429,7 +4429,7 @@ class TernaryVRSbGeneric<string mnemonic, bits<16> opcode>
}
class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
- Immediate index>
+ ImmOpWithPattern index>
: InstVRV<opcode, (outs VR128:$V1),
(ins VR128:$V1src, bdvaddr12only:$VBD2, index:$M3),
mnemonic#"\t$V1, $VBD2, $M3", []> {
@@ -4440,7 +4440,7 @@ class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
}
class TernaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- TypedReg tr1, TypedReg tr2, bits<5> bytes, Immediate index>
+ TypedReg tr1, TypedReg tr2, bits<5> bytes, ImmOpWithPattern index>
: InstVRX<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V1src, bdxaddr12only:$XBD2, index:$M3),
mnemonic#"\t$V1, $XBD2, $M3",
@@ -4461,7 +4461,7 @@ class QuaternaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operato
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V1src),
(tr2.vt tr2.op:$V2),
(tr2.vt tr2.op:$V3),
- imm32zx8:$I4))]> {
+ imm32zx8_timm:$I4))]> {
let Constraints = "$V1 = $V1src";
let DisableEncoding = "$V1src";
let M5 = type;
@@ -4480,7 +4480,7 @@ class QuaternaryVRIf<string mnemonic, bits<16> opcode>
: InstVRIf<opcode, (outs VR128:$V1),
(ins VR128:$V2, VR128:$V3,
imm32zx8:$I4, imm32zx4:$M5),
- mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []>;
+ mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []>;
class QuaternaryVRIg<string mnemonic, bits<16> opcode>
: InstVRIg<opcode, (outs VR128:$V1),
@@ -4491,7 +4491,7 @@ class QuaternaryVRIg<string mnemonic, bits<16> opcode>
class QuaternaryVRRd<string mnemonic, bits<16> opcode,
SDPatternOperator operator, TypedReg tr1, TypedReg tr2,
TypedReg tr3, TypedReg tr4, bits<4> type,
- SDPatternOperator m6mask = imm32zx4, bits<4> m6or = 0>
+ SDPatternOperator m6mask = imm32zx4_timm, bits<4> m6or = 0>
: InstVRRd<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr3.op:$V3, tr4.op:$V4, m6mask:$M6),
mnemonic#"\t$V1, $V2, $V3, $V4, $M6",
@@ -4518,14 +4518,14 @@ multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode,
bits<4> modifier = 0> {
def "" : QuaternaryVRRd<mnemonic, opcode, operator,
tr1, tr2, tr2, tr2, type,
- imm32zx4even, !and (modifier, 14)>;
+ imm32zx4even_timm, !and (modifier, 14)>;
def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4",
(!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
tr2.op:$V3, tr2.op:$V4, 0)>;
let Defs = [CC] in
def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc,
tr1, tr2, tr2, tr2, type,
- imm32zx4even, !add (!and (modifier, 14), 1)>;
+ imm32zx4even_timm, !add (!and (modifier, 14), 1)>;
def : InstAlias<mnemonic#"s\t$V1, $V2, $V3, $V4",
(!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2,
tr2.op:$V3, tr2.op:$V4, 0)>;
@@ -4536,7 +4536,7 @@ multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> {
def "" : QuaternaryVRRdGeneric<mnemonic, opcode>;
def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4, $M5",
(!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3,
- VR128:$V4, imm32zx4:$M5, 0)>;
+ VR128:$V4, imm32zx4_timm:$M5, 0)>;
}
class SideEffectQuaternaryRRFa<string mnemonic, bits<16> opcode,
@@ -4638,13 +4638,13 @@ class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator>
: InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2),
mnemonic##"\t$M1, $XBD2",
- [(operator imm32zx4:$M1, bdxaddr20only:$XBD2)]>;
+ [(operator imm32zx4_timm:$M1, bdxaddr20only:$XBD2)]>;
class PrefetchRILPC<string mnemonic, bits<12> opcode,
SDPatternOperator operator>
- : InstRILc<opcode, (outs), (ins imm32zx4:$M1, pcrel32:$RI2),
+ : InstRILc<opcode, (outs), (ins imm32zx4_timm:$M1, pcrel32:$RI2),
mnemonic##"\t$M1, $RI2",
- [(operator imm32zx4:$M1, pcrel32:$RI2)]> {
+ [(operator imm32zx4_timm:$M1, pcrel32:$RI2)]> {
// We want PC-relative addresses to be tried ahead of BD and BDX addresses.
// However, BDXs have two extra operands and are therefore 6 units more
// complex.
@@ -4691,7 +4691,7 @@ class Pseudo<dag outs, dag ins, list<dag> pattern>
// Like UnaryRI, but expanded after RA depending on the choice of register.
class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
- Immediate imm>
+ ImmOpWithPattern imm>
: Pseudo<(outs cls:$R1), (ins imm:$I2),
[(set cls:$R1, (operator imm:$I2))]>;
@@ -4720,7 +4720,7 @@ class UnaryRRPseudo<string key, SDPatternOperator operator,
// Like BinaryRI, but expanded after RA depending on the choice of register.
class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
- Immediate imm>
+ ImmOpWithPattern imm>
: Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2),
[(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
let Constraints = "$R1 = $R1src";
@@ -4728,13 +4728,13 @@ class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
// Like BinaryRIE, but expanded after RA depending on the choice of register.
class BinaryRIEPseudo<SDPatternOperator operator, RegisterOperand cls,
- Immediate imm>
+ ImmOpWithPattern imm>
: Pseudo<(outs cls:$R1), (ins cls:$R3, imm:$I2),
[(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
// Like BinaryRIAndK, but expanded after RA depending on the choice of register.
multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
- RegisterOperand cls, Immediate imm> {
+ RegisterOperand cls, ImmOpWithPattern imm> {
let NumOpsKey = key in {
let NumOpsValue = "3" in
def K : BinaryRIEPseudo<operator, cls, imm>,
@@ -4764,7 +4764,7 @@ class MemFoldPseudo<string mnemonic, RegisterOperand cls, bits<5> bytes,
// Like CompareRI, but expanded after RA depending on the choice of register.
class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
- Immediate imm>
+ ImmOpWithPattern imm>
: Pseudo<(outs), (ins cls:$R1, imm:$I2),
[(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
@@ -4783,7 +4783,7 @@ class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
}
// Like TestBinarySIL, but expanded later.
-class TestBinarySILPseudo<SDPatternOperator operator, Immediate imm>
+class TestBinarySILPseudo<SDPatternOperator operator, ImmOpWithPattern imm>
: Pseudo<(outs), (ins bdaddr12only:$BD1, imm:$I2),
[(set CC, (operator bdaddr12only:$BD1, imm:$I2))]>;
@@ -4812,7 +4812,7 @@ class CondBinaryRRFaPseudo<RegisterOperand cls1, RegisterOperand cls2,
// Like CondBinaryRIE, but expanded after RA depending on the choice of
// register.
-class CondBinaryRIEPseudo<RegisterOperand cls, Immediate imm>
+class CondBinaryRIEPseudo<RegisterOperand cls, ImmOpWithPattern imm>
: Pseudo<(outs cls:$R1),
(ins cls:$R1src, imm:$I2, cond4:$valid, cond4:$M3),
[(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src,
@@ -4876,7 +4876,7 @@ class SelectWrapper<ValueType vt, RegisterOperand cls>
: Pseudo<(outs cls:$dst),
(ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc),
[(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2,
- imm32zx4:$valid, imm32zx4:$cc))]> {
+ imm32zx4_timm:$valid, imm32zx4_timm:$cc))]> {
let usesCustomInserter = 1;
let hasNoSchedulingInfo = 1;
let Uses = [CC];
@@ -4890,12 +4890,12 @@ multiclass CondStores<RegisterOperand cls, SDPatternOperator store,
def "" : Pseudo<(outs),
(ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
[(store (z_select_ccmask cls:$new, (load mode:$addr),
- imm32zx4:$valid, imm32zx4:$cc),
+ imm32zx4_timm:$valid, imm32zx4_timm:$cc),
mode:$addr)]>;
def Inv : Pseudo<(outs),
(ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
[(store (z_select_ccmask (load mode:$addr), cls:$new,
- imm32zx4:$valid, imm32zx4:$cc),
+ imm32zx4_timm:$valid, imm32zx4_timm:$cc),
mode:$addr)]>;
}
}
@@ -4917,11 +4917,11 @@ class AtomicLoadBinary<SDPatternOperator operator, RegisterOperand cls,
// Specializations of AtomicLoadWBinary.
class AtomicLoadBinaryReg32<SDPatternOperator operator>
: AtomicLoadBinary<operator, GR32, (i32 GR32:$src2), GR32>;
-class AtomicLoadBinaryImm32<SDPatternOperator operator, Immediate imm>
+class AtomicLoadBinaryImm32<SDPatternOperator operator, ImmOpWithPattern imm>
: AtomicLoadBinary<operator, GR32, (i32 imm:$src2), imm>;
class AtomicLoadBinaryReg64<SDPatternOperator operator>
: AtomicLoadBinary<operator, GR64, (i64 GR64:$src2), GR64>;
-class AtomicLoadBinaryImm64<SDPatternOperator operator, Immediate imm>
+class AtomicLoadBinaryImm64<SDPatternOperator operator, ImmOpWithPattern imm>
: AtomicLoadBinary<operator, GR64, (i64 imm:$src2), imm>;
// OPERATOR is ATOMIC_SWAPW or an ATOMIC_LOADW_* operation. PAT and OPERAND
@@ -4944,7 +4944,7 @@ class AtomicLoadWBinary<SDPatternOperator operator, dag pat,
// Specializations of AtomicLoadWBinary.
class AtomicLoadWBinaryReg<SDPatternOperator operator>
: AtomicLoadWBinary<operator, (i32 GR32:$src2), GR32>;
-class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
+class AtomicLoadWBinaryImm<SDPatternOperator operator, ImmOpWithPattern imm>
: AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
// A pseudo instruction that is a direct alias of a real instruction.
@@ -4979,7 +4979,7 @@ class StoreAliasVRX<SDPatternOperator operator, TypedReg tr,
// An alias of a BinaryRI, but with different register sizes.
class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
- Immediate imm>
+ ImmOpWithPattern imm>
: Alias<4, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
[(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
let Constraints = "$R1 = $R1src";
@@ -4987,7 +4987,7 @@ class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
// An alias of a BinaryRIL, but with different register sizes.
class BinaryAliasRIL<SDPatternOperator operator, RegisterOperand cls,
- Immediate imm>
+ ImmOpWithPattern imm>
: Alias<6, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
[(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
let Constraints = "$R1 = $R1src";
@@ -4999,7 +4999,7 @@ class BinaryAliasVRRf<RegisterOperand cls>
// An alias of a CompareRI, but with different register sizes.
class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
- Immediate imm>
+ ImmOpWithPattern imm>
: Alias<4, (outs), (ins cls:$R1, imm:$I2),
[(set CC, (operator cls:$R1, imm:$I2))]> {
let isCompare = 1;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 57c1cf4ec70a..bc783608d45b 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -46,22 +46,12 @@ using namespace llvm;
#include "SystemZGenInstrInfo.inc"
#define DEBUG_TYPE "systemz-II"
-STATISTIC(LOCRMuxJumps, "Number of LOCRMux jump-sequences (lower is better)");
// Return a mask with Count low bits set.
static uint64_t allOnes(unsigned int Count) {
return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
}
-// Reg should be a 32-bit GPR. Return true if it is a high register rather
-// than a low register.
-static bool isHighReg(unsigned int Reg) {
- if (SystemZ::GRH32BitRegClass.contains(Reg))
- return true;
- assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32");
- return false;
-}
-
// Pin the vtable to this file.
void SystemZInstrInfo::anchor() {}
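The static isHighReg() helper deleted above is not gone: later hunks call it as SystemZ::isHighReg() from both SystemZInstrInfo.cpp and SystemZPostRewrite.cpp, so it has evidently been hoisted into a shared SystemZ header that this excerpt does not show. A minimal sketch of what that shared helper presumably looks like, mirroring the removed body (the exact header it lives in, and its inline-ness, are assumptions):

#include <cassert>
// Assumes the TableGen-generated register classes (GR32BitRegClass,
// GRH32BitRegClass) are visible, as they are throughout the SystemZ backend.
namespace llvm {
namespace SystemZ {
// Reg should be a 32-bit GPR. Return true if it is a high register rather
// than a low register. Sketch only: the body mirrors the static function
// removed above; its placement in a particular header is assumed.
inline bool isHighReg(unsigned Reg) {
  if (SystemZ::GRH32BitRegClass.contains(Reg))
    return true;
  assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32");
  return false;
}
} // end namespace SystemZ
} // end namespace llvm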
@@ -85,7 +75,7 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
// Set up the two 64-bit registers and remember super reg and its flags.
MachineOperand &HighRegOp = EarlierMI->getOperand(0);
MachineOperand &LowRegOp = MI->getOperand(0);
- unsigned Reg128 = LowRegOp.getReg();
+ Register Reg128 = LowRegOp.getReg();
unsigned Reg128Killed = getKillRegState(LowRegOp.isKill());
unsigned Reg128Undef = getUndefRegState(LowRegOp.isUndef());
HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
@@ -147,8 +137,8 @@ void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const {
void SystemZInstrInfo::expandRIPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned HighOpcode,
bool ConvertHigh) const {
- unsigned Reg = MI.getOperand(0).getReg();
- bool IsHigh = isHighReg(Reg);
+ Register Reg = MI.getOperand(0).getReg();
+ bool IsHigh = SystemZ::isHighReg(Reg);
MI.setDesc(get(IsHigh ? HighOpcode : LowOpcode));
if (IsHigh && ConvertHigh)
MI.getOperand(1).setImm(uint32_t(MI.getOperand(1).getImm()));
@@ -161,10 +151,10 @@ void SystemZInstrInfo::expandRIPseudo(MachineInstr &MI, unsigned LowOpcode,
void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned LowOpcodeK,
unsigned HighOpcode) const {
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
- bool DestIsHigh = isHighReg(DestReg);
- bool SrcIsHigh = isHighReg(SrcReg);
+ Register DestReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ bool DestIsHigh = SystemZ::isHighReg(DestReg);
+ bool SrcIsHigh = SystemZ::isHighReg(SrcReg);
if (!DestIsHigh && !SrcIsHigh)
MI.setDesc(get(LowOpcodeK));
else {
@@ -184,9 +174,10 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
// is a high GR32.
void SystemZInstrInfo::expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned HighOpcode) const {
- unsigned Reg = MI.getOperand(0).getReg();
- unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode,
- MI.getOperand(2).getImm());
+ Register Reg = MI.getOperand(0).getReg();
+ unsigned Opcode = getOpcodeForOffset(
+ SystemZ::isHighReg(Reg) ? HighOpcode : LowOpcode,
+ MI.getOperand(2).getImm());
MI.setDesc(get(Opcode));
}
@@ -195,93 +186,11 @@ void SystemZInstrInfo::expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode,
// register is a low GR32 and HighOpcode if the register is a high GR32.
void SystemZInstrInfo::expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned HighOpcode) const {
- unsigned Reg = MI.getOperand(0).getReg();
- unsigned Opcode = isHighReg(Reg) ? HighOpcode : LowOpcode;
+ Register Reg = MI.getOperand(0).getReg();
+ unsigned Opcode = SystemZ::isHighReg(Reg) ? HighOpcode : LowOpcode;
MI.setDesc(get(Opcode));
}
-// MI is a load-register-on-condition pseudo instruction. Replace it with
-// LowOpcode if source and destination are both low GR32s and HighOpcode if
-// source and destination are both high GR32s.
-void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
- unsigned HighOpcode) const {
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(2).getReg();
- bool DestIsHigh = isHighReg(DestReg);
- bool SrcIsHigh = isHighReg(SrcReg);
-
- if (!DestIsHigh && !SrcIsHigh)
- MI.setDesc(get(LowOpcode));
- else if (DestIsHigh && SrcIsHigh)
- MI.setDesc(get(HighOpcode));
- else
- LOCRMuxJumps++;
-
- // If we were unable to implement the pseudo with a single instruction, we
- // need to convert it back into a branch sequence. This cannot be done here
- // since the caller of expandPostRAPseudo does not handle changes to the CFG
- // correctly. This change is defered to the SystemZExpandPseudo pass.
-}
-
-// MI is a select pseudo instruction. Replace it with LowOpcode if source
-// and destination are all low GR32s and HighOpcode if source and destination
-// are all high GR32s. Otherwise, use the two-operand MixedOpcode.
-void SystemZInstrInfo::expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode,
- unsigned HighOpcode,
- unsigned MixedOpcode) const {
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned Src1Reg = MI.getOperand(1).getReg();
- unsigned Src2Reg = MI.getOperand(2).getReg();
- bool DestIsHigh = isHighReg(DestReg);
- bool Src1IsHigh = isHighReg(Src1Reg);
- bool Src2IsHigh = isHighReg(Src2Reg);
-
- // If sources and destination aren't all high or all low, we may be able to
- // simplify the operation by moving one of the sources to the destination
- // first. But only if this doesn't clobber the other source.
- if (DestReg != Src1Reg && DestReg != Src2Reg) {
- if (DestIsHigh != Src1IsHigh) {
- emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src1Reg,
- SystemZ::LR, 32, MI.getOperand(1).isKill(),
- MI.getOperand(1).isUndef());
- MI.getOperand(1).setReg(DestReg);
- Src1Reg = DestReg;
- Src1IsHigh = DestIsHigh;
- } else if (DestIsHigh != Src2IsHigh) {
- emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, Src2Reg,
- SystemZ::LR, 32, MI.getOperand(2).isKill(),
- MI.getOperand(2).isUndef());
- MI.getOperand(2).setReg(DestReg);
- Src2Reg = DestReg;
- Src2IsHigh = DestIsHigh;
- }
- }
-
- // If the destination (now) matches one source, prefer this to be first.
- if (DestReg != Src1Reg && DestReg == Src2Reg) {
- commuteInstruction(MI, false, 1, 2);
- std::swap(Src1Reg, Src2Reg);
- std::swap(Src1IsHigh, Src2IsHigh);
- }
-
- if (!DestIsHigh && !Src1IsHigh && !Src2IsHigh)
- MI.setDesc(get(LowOpcode));
- else if (DestIsHigh && Src1IsHigh && Src2IsHigh)
- MI.setDesc(get(HighOpcode));
- else {
- // Given the simplifcation above, we must already have a two-operand case.
- assert (DestReg == Src1Reg);
- MI.setDesc(get(MixedOpcode));
- MI.tieOperands(0, 1);
- LOCRMuxJumps++;
- }
-
- // If we were unable to implement the pseudo with a single instruction, we
- // need to convert it back into a branch sequence. This cannot be done here
- // since the caller of expandPostRAPseudo does not handle changes to the CFG
- // correctly. This change is defered to the SystemZExpandPseudo pass.
-}
-
// MI is an RR-style pseudo instruction that zero-extends the low Size bits
// of one GRX32 into another. Replace it with LowOpcode if both operands
// are low registers, otherwise use RISB[LH]G.
@@ -302,8 +211,8 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction &MF = *MBB->getParent();
- const unsigned Reg64 = MI->getOperand(0).getReg();
- const unsigned Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32);
+ const Register Reg64 = MI->getOperand(0).getReg();
+ const Register Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32);
  // EAR can only load the low subregister so use a shift for %a0 to produce
// the GR containing %a0 and %a1.
@@ -341,8 +250,8 @@ SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
unsigned Size, bool KillSrc,
bool UndefSrc) const {
unsigned Opcode;
- bool DestIsHigh = isHighReg(DestReg);
- bool SrcIsHigh = isHighReg(SrcReg);
+ bool DestIsHigh = SystemZ::isHighReg(DestReg);
+ bool SrcIsHigh = SystemZ::isHighReg(SrcReg);
if (DestIsHigh && SrcIsHigh)
Opcode = SystemZ::RISBHH;
else if (DestIsHigh && !SrcIsHigh)
@@ -468,7 +377,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// Can't handle indirect branches.
SystemZII::Branch Branch(getBranchInfo(*I));
- if (!Branch.Target->isMBB())
+ if (!Branch.hasMBBTarget())
return true;
// Punt on compound branches.
@@ -478,7 +387,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
if (Branch.CCMask == SystemZ::CCMASK_ANY) {
// Handle unconditional branches.
if (!AllowModify) {
- TBB = Branch.Target->getMBB();
+ TBB = Branch.getMBBTarget();
continue;
}
@@ -490,7 +399,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
FBB = nullptr;
// Delete the JMP if it's equivalent to a fall-through.
- if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) {
+ if (MBB.isLayoutSuccessor(Branch.getMBBTarget())) {
TBB = nullptr;
I->eraseFromParent();
I = MBB.end();
@@ -498,7 +407,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
}
  // TBB is used to indicate the unconditional destination.
- TBB = Branch.Target->getMBB();
+ TBB = Branch.getMBBTarget();
continue;
}
@@ -506,7 +415,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
if (Cond.empty()) {
// FIXME: add X86-style branch swap
FBB = TBB;
- TBB = Branch.Target->getMBB();
+ TBB = Branch.getMBBTarget();
Cond.push_back(MachineOperand::CreateImm(Branch.CCValid));
Cond.push_back(MachineOperand::CreateImm(Branch.CCMask));
continue;
@@ -517,7 +426,7 @@ bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// Only handle the case where all conditional branches branch to the same
// destination.
- if (TBB != Branch.Target->getMBB())
+ if (TBB != Branch.getMBBTarget())
return true;
// If the conditions are the same, we can leave them alone.
@@ -547,7 +456,7 @@ unsigned SystemZInstrInfo::removeBranch(MachineBasicBlock &MBB,
continue;
if (!I->isBranch())
break;
- if (!getBranchInfo(*I).Target->isMBB())
+ if (!getBranchInfo(*I).hasMBBTarget())
break;
// Remove the branch.
I->eraseFromParent();
@@ -676,8 +585,8 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
else {
Opc = SystemZ::LOCR;
MRI.constrainRegClass(DstReg, &SystemZ::GR32BitRegClass);
- unsigned TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
- unsigned FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
+ Register TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
+ Register FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
BuildMI(MBB, I, DL, get(TargetOpcode::COPY), TReg).addReg(TrueReg);
BuildMI(MBB, I, DL, get(TargetOpcode::COPY), FReg).addReg(FalseReg);
TrueReg = TReg;
@@ -1258,13 +1167,14 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
assert(NumOps == 3 && "Expected two source registers.");
Register DstReg = MI.getOperand(0).getReg();
Register DstPhys =
- (TRI->isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg);
+ (Register::isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg);
Register SrcReg = (OpNum == 2 ? MI.getOperand(1).getReg()
: ((OpNum == 1 && MI.isCommutable())
? MI.getOperand(2).getReg()
: Register()));
if (DstPhys && !SystemZ::GRH32BitRegClass.contains(DstPhys) && SrcReg &&
- TRI->isVirtualRegister(SrcReg) && DstPhys == VRM->getPhys(SrcReg))
+ Register::isVirtualRegister(SrcReg) &&
+ DstPhys == VRM->getPhys(SrcReg))
NeedsCommute = (OpNum == 1);
else
MemOpcode = -1;
@@ -1358,15 +1268,6 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
expandLOCPseudo(MI, SystemZ::LOCHI, SystemZ::LOCHHI);
return true;
- case SystemZ::LOCRMux:
- expandLOCRPseudo(MI, SystemZ::LOCR, SystemZ::LOCFHR);
- return true;
-
- case SystemZ::SELRMux:
- expandSELRPseudo(MI, SystemZ::SELR, SystemZ::SELFHR,
- SystemZ::LOCRMux);
- return true;
-
case SystemZ::STCMux:
expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH);
return true;
@@ -1468,8 +1369,8 @@ bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
case SystemZ::RISBMux: {
- bool DestIsHigh = isHighReg(MI.getOperand(0).getReg());
- bool SrcIsHigh = isHighReg(MI.getOperand(2).getReg());
+ bool DestIsHigh = SystemZ::isHighReg(MI.getOperand(0).getReg());
+ bool SrcIsHigh = SystemZ::isHighReg(MI.getOperand(2).getReg());
if (SrcIsHigh == DestIsHigh)
MI.setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL));
else {
@@ -1545,6 +1446,10 @@ SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const {
return SystemZII::Branch(SystemZII::BranchCLG, SystemZ::CCMASK_ICMP,
MI.getOperand(2).getImm(), &MI.getOperand(3));
+ case SystemZ::INLINEASM_BR:
+ // Don't try to analyze asm goto, so pass nullptr as branch target argument.
+ return SystemZII::Branch(SystemZII::AsmGoto, 0, 0, nullptr);
+
default:
llvm_unreachable("Unrecognized branch opcode");
}
@@ -1845,8 +1750,7 @@ void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
bool SystemZInstrInfo::
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA) const {
+ const MachineInstr &MIb) const {
if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand())
return false;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 2edde175542e..6dc6e72aa52a 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -100,11 +100,18 @@ enum BranchType {
// An instruction that decrements a 64-bit register and branches if
// the result is nonzero.
- BranchCTG
+ BranchCTG,
+
+ // An instruction representing an asm goto statement.
+ AsmGoto
};
// Information about a branch instruction.
-struct Branch {
+class Branch {
+ // The target of the branch. In case of INLINEASM_BR, this is nullptr.
+ const MachineOperand *Target;
+
+public:
// The type of the branch.
BranchType Type;
@@ -114,12 +121,15 @@ struct Branch {
// CCMASK_<N> is set if the branch should be taken when CC == N.
unsigned CCMask;
- // The target of the branch.
- const MachineOperand *Target;
-
Branch(BranchType type, unsigned ccValid, unsigned ccMask,
const MachineOperand *target)
- : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
+ : Target(target), Type(type), CCValid(ccValid), CCMask(ccMask) {}
+
+ bool isIndirect() { return Target != nullptr && Target->isReg(); }
+ bool hasMBBTarget() { return Target != nullptr && Target->isMBB(); }
+ MachineBasicBlock *getMBBTarget() {
+ return hasMBBTarget() ? Target->getMBB() : nullptr;
+ }
};
// Kinds of fused compares in compare-and-* instructions. Together with type
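With Target now private and possibly null (asm goto), callers elsewhere in this patch switch to the new accessors instead of dereferencing the operand directly. A brief usage sketch, not code from the patch itself, assuming a SystemZInstrInfo &TII and a branch MachineInstr &MI are in scope:

// Usage sketch of the new SystemZII::Branch accessors (assumed context:
// SystemZInstrInfo &TII, const MachineInstr &MI).
static bool describeBranch(const SystemZInstrInfo &TII, const MachineInstr &MI) {
  SystemZII::Branch B = TII.getBranchInfo(MI);
  if (B.hasMBBTarget())
    return true;   // direct branch: destination is B.getMBBTarget()
  if (B.isIndirect())
    return false;  // register-indirect branch: destination unknown
  return false;    // null target, i.e. SystemZII::AsmGoto from INLINEASM_BR
}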
@@ -160,10 +170,6 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
unsigned HighOpcode) const;
void expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned HighOpcode) const;
- void expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
- unsigned HighOpcode) const;
- void expandSELRPseudo(MachineInstr &MI, unsigned LowOpcode,
- unsigned HighOpcode, unsigned MixedOpcode) const;
void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned Size) const;
void expandLoadStackGuard(MachineInstr *MI) const;
@@ -322,8 +328,7 @@ public:
// memory addresses and false otherwise.
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA = nullptr) const override;
+ const MachineInstr &MIb) const override;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 91856893e3bd..8b334756611a 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -337,15 +337,15 @@ defm CondStore8Mux : CondStores<GRX32, nonvolatile_truncstorei8,
defm CondStore16Mux : CondStores<GRX32, nonvolatile_truncstorei16,
nonvolatile_anyextloadi16, bdxaddr20only>,
Requires<[FeatureHighWord]>;
-defm CondStore32Mux : CondStores<GRX32, nonvolatile_store,
- nonvolatile_load, bdxaddr20only>,
+defm CondStore32Mux : CondStores<GRX32, simple_store,
+ simple_load, bdxaddr20only>,
Requires<[FeatureLoadStoreOnCond2]>;
defm CondStore8 : CondStores<GR32, nonvolatile_truncstorei8,
nonvolatile_anyextloadi8, bdxaddr20only>;
defm CondStore16 : CondStores<GR32, nonvolatile_truncstorei16,
nonvolatile_anyextloadi16, bdxaddr20only>;
-defm CondStore32 : CondStores<GR32, nonvolatile_store,
- nonvolatile_load, bdxaddr20only>;
+defm CondStore32 : CondStores<GR32, simple_store,
+ simple_load, bdxaddr20only>;
defm : CondStores64<CondStore8, CondStore8Inv, nonvolatile_truncstorei8,
nonvolatile_anyextloadi8, bdxaddr20only>;
@@ -353,8 +353,8 @@ defm : CondStores64<CondStore16, CondStore16Inv, nonvolatile_truncstorei16,
nonvolatile_anyextloadi16, bdxaddr20only>;
defm : CondStores64<CondStore32, CondStore32Inv, nonvolatile_truncstorei32,
nonvolatile_anyextloadi32, bdxaddr20only>;
-defm CondStore64 : CondStores<GR64, nonvolatile_store,
- nonvolatile_load, bdxaddr20only>;
+defm CondStore64 : CondStores<GR64, simple_store,
+ simple_load, bdxaddr20only>;
//===----------------------------------------------------------------------===//
// Move instructions
@@ -531,8 +531,8 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
// Load on condition. Matched via DAG pattern.
// Expands to LOC or LOCFH, depending on the choice of register.
- def LOCMux : CondUnaryRSYPseudo<nonvolatile_load, GRX32, 4>;
- defm LOCFH : CondUnaryRSYPair<"locfh", 0xEBE0, nonvolatile_load, GRH32, 4>;
+ def LOCMux : CondUnaryRSYPseudo<simple_load, GRX32, 4>;
+ defm LOCFH : CondUnaryRSYPair<"locfh", 0xEBE0, simple_load, GRH32, 4>;
// Store on condition. Expanded from CondStore* pseudos.
// Expands to STOC or STOCFH, depending on the choice of register.
@@ -563,8 +563,8 @@ let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in {
}
// Load on condition. Matched via DAG pattern.
- defm LOC : CondUnaryRSYPair<"loc", 0xEBF2, nonvolatile_load, GR32, 4>;
- defm LOCG : CondUnaryRSYPair<"locg", 0xEBE2, nonvolatile_load, GR64, 8>;
+ defm LOC : CondUnaryRSYPair<"loc", 0xEBF2, simple_load, GR32, 4>;
+ defm LOCG : CondUnaryRSYPair<"locg", 0xEBE2, simple_load, GR64, 8>;
// Store on condition. Expanded from CondStore* pseudos.
defm STOC : CondStoreRSYPair<"stoc", 0xEBF3, GR32, 4>;
@@ -2082,7 +2082,7 @@ let Predicates = [FeatureProcessorAssist] in {
// cleared. We only use the first result here.
let Defs = [CC] in
def FLOGR : UnaryRRE<"flogr", 0xB983, null_frag, GR128, GR64>;
-def : Pat<(ctlz GR64:$src),
+def : Pat<(i64 (ctlz GR64:$src)),
(EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
// Population count. Counts bits set per byte or doubleword.
diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
index 261727f89058..02364bbda5c1 100644
--- a/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -60,7 +60,7 @@ let Predicates = [FeatureVector] in {
// Generate byte mask.
def VZERO : InherentVRIa<"vzero", 0xE744, 0>;
def VONE : InherentVRIa<"vone", 0xE744, 0xffff>;
- def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>;
+ def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16_timm>;
// Generate mask.
def VGM : BinaryVRIbGeneric<"vgm", 0xE746>;
@@ -71,10 +71,10 @@ let Predicates = [FeatureVector] in {
// Replicate immediate.
def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>;
- def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>;
- def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>;
- def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>;
- def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>;
+ def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16_timm, 0>;
+ def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16_timm, 1>;
+ def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16_timm, 2>;
+ def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16_timm, 3>;
}
// Load element immediate.
@@ -116,7 +116,7 @@ let Predicates = [FeatureVector] in {
(ins bdxaddr12only:$XBD2, imm32zx4:$M3),
"lcbb\t$R1, $XBD2, $M3",
[(set GR32:$R1, (int_s390_lcbb bdxaddr12only:$XBD2,
- imm32zx4:$M3))]>;
+ imm32zx4_timm:$M3))]>;
// Load with length. The number of loaded bytes is only known at run time.
def VLL : BinaryVRSb<"vll", 0xE737, int_s390_vll, 0>;
@@ -362,9 +362,9 @@ let Predicates = [FeatureVector] in {
def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>;
def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>;
def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>;
- def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16:$index)),
+ def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16_timm:$index)),
(VREPF VR128:$vec, imm32zx16:$index)>;
- def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16:$index)),
+ def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16_timm:$index)),
(VREPG VR128:$vec, imm32zx16:$index)>;
// Select.
@@ -778,7 +778,7 @@ let Predicates = [FeatureVector] in {
// Shift left double by byte.
def VSLDB : TernaryVRId<"vsldb", 0xE777, z_shl_double, v128b, v128b, 0>;
- def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8:$z),
+ def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8_timm:$z),
(VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>;
// Shift left double by bit.
@@ -1069,7 +1069,7 @@ let Predicates = [FeatureVector] in {
def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>;
}
// Rounding mode should agree with SystemZInstrFP.td.
- def : FPConversion<VCGDB, fp_to_sint, v128g, v128db, 0, 5>;
+ def : FPConversion<VCGDB, any_fp_to_sint, v128g, v128db, 0, 5>;
let Predicates = [FeatureVectorEnhancements2] in {
let Uses = [FPC], mayRaiseFPException = 1 in {
let isAsmParserOnly = 1 in
@@ -1078,7 +1078,7 @@ let Predicates = [FeatureVector] in {
def WCFEB : TernaryVRRa<"wcfeb", 0xE7C2, null_frag, v32sb, v32f, 2, 8>;
}
// Rounding mode should agree with SystemZInstrFP.td.
- def : FPConversion<VCFEB, fp_to_sint, v128f, v128sb, 0, 5>;
+ def : FPConversion<VCFEB, any_fp_to_sint, v128f, v128sb, 0, 5>;
}
// Convert to logical.
@@ -1088,7 +1088,7 @@ let Predicates = [FeatureVector] in {
def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>;
}
// Rounding mode should agree with SystemZInstrFP.td.
- def : FPConversion<VCLGDB, fp_to_uint, v128g, v128db, 0, 5>;
+ def : FPConversion<VCLGDB, any_fp_to_uint, v128g, v128db, 0, 5>;
let Predicates = [FeatureVectorEnhancements2] in {
let Uses = [FPC], mayRaiseFPException = 1 in {
let isAsmParserOnly = 1 in
@@ -1097,7 +1097,7 @@ let Predicates = [FeatureVector] in {
def WCLFEB : TernaryVRRa<"wclfeb", 0xE7C0, null_frag, v32sb, v32f, 2, 8>;
}
// Rounding mode should agree with SystemZInstrFP.td.
- def : FPConversion<VCLFEB, fp_to_uint, v128f, v128sb, 0, 5>;
+ def : FPConversion<VCLFEB, any_fp_to_uint, v128f, v128sb, 0, 5>;
}
// Divide.
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index 95d7e22dec32..724111229569 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -85,9 +85,9 @@ struct MBBInfo {
// This value never changes.
uint64_t Size = 0;
- // The minimum alignment of the block, as a log2 value.
+ // The minimum alignment of the block.
// This value never changes.
- unsigned Alignment = 0;
+ Align Alignment;
// The number of terminators in this block. This value never changes.
unsigned NumTerminators = 0;
@@ -127,7 +127,8 @@ struct BlockPosition {
// as the runtime address.
unsigned KnownBits;
- BlockPosition(unsigned InitialAlignment) : KnownBits(InitialAlignment) {}
+ BlockPosition(unsigned InitialLogAlignment)
+ : KnownBits(InitialLogAlignment) {}
};
class SystemZLongBranch : public MachineFunctionPass {
@@ -178,17 +179,16 @@ const uint64_t MaxForwardRange = 0xfffe;
// instructions.
void SystemZLongBranch::skipNonTerminators(BlockPosition &Position,
MBBInfo &Block) {
- if (Block.Alignment > Position.KnownBits) {
+ if (Log2(Block.Alignment) > Position.KnownBits) {
// When calculating the address of Block, we need to conservatively
// assume that Block had the worst possible misalignment.
- Position.Address += ((uint64_t(1) << Block.Alignment) -
- (uint64_t(1) << Position.KnownBits));
- Position.KnownBits = Block.Alignment;
+ Position.Address +=
+ (Block.Alignment.value() - (uint64_t(1) << Position.KnownBits));
+ Position.KnownBits = Log2(Block.Alignment);
}
// Align the addresses.
- uint64_t AlignMask = (uint64_t(1) << Block.Alignment) - 1;
- Position.Address = (Position.Address + AlignMask) & ~AlignMask;
+ Position.Address = alignTo(Position.Address, Block.Alignment);
// Record the block's position.
Block.Address = Position.Address;
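For readers tracking the log2-to-Align conversion in this pass: the arithmetic is unchanged, only the representation moves from a raw log2 value to llvm::Align. A standalone sketch of the equivalence, not part of the patch:

// Equivalence relied on by the hunks above: alignTo(Addr, A) rounds up
// exactly like the old mask-based code, and Log2(A) recovers the old
// log2 "Alignment" field.
#include "llvm/Support/Alignment.h"
#include <cstdint>
using namespace llvm;

static uint64_t roundUpOld(uint64_t Addr, unsigned LogAlign) {
  uint64_t Mask = (uint64_t(1) << LogAlign) - 1; // old formulation
  return (Addr + Mask) & ~Mask;
}

static uint64_t roundUpNew(uint64_t Addr, Align A) {
  return alignTo(Addr, A);                       // new formulation
}
// roundUpOld(13, 4) == roundUpNew(13, Align(16)) == 16, and Log2(Align(16)) == 4.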
@@ -257,7 +257,7 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr &MI) {
}
Terminator.Branch = &MI;
Terminator.TargetBlock =
- TII->getBranchInfo(MI).Target->getMBB()->getNumber();
+ TII->getBranchInfo(MI).getMBBTarget()->getNumber();
}
return Terminator;
}
@@ -275,7 +275,7 @@ uint64_t SystemZLongBranch::initMBBInfo() {
Terminators.clear();
Terminators.reserve(NumBlocks);
- BlockPosition Position(MF->getAlignment());
+ BlockPosition Position(Log2(MF->getAlignment()));
for (unsigned I = 0; I < NumBlocks; ++I) {
MachineBasicBlock *MBB = MF->getBlockNumbered(I);
MBBInfo &Block = MBBs[I];
@@ -339,7 +339,7 @@ bool SystemZLongBranch::mustRelaxABranch() {
// must be long.
void SystemZLongBranch::setWorstCaseAddresses() {
SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
- BlockPosition Position(MF->getAlignment());
+ BlockPosition Position(Log2(MF->getAlignment()));
for (auto &Block : MBBs) {
skipNonTerminators(Position, Block);
for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
@@ -440,7 +440,7 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
// Run a shortening pass and relax any branches that need to be relaxed.
void SystemZLongBranch::relaxBranches() {
SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
- BlockPosition Position(MF->getAlignment());
+ BlockPosition Position(Log2(MF->getAlignment()));
for (auto &Block : MBBs) {
skipNonTerminators(Position, Block);
for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 0becfaa1d49c..3fc25034dded 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "SystemZMachineScheduler.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
using namespace llvm;
@@ -108,8 +109,8 @@ void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
I != SinglePredMBB->end(); I++) {
LLVM_DEBUG(dbgs() << "** Emitting incoming branch: "; I->dump(););
bool TakenBranch = (I->isBranch() &&
- (TII->getBranchInfo(*I).Target->isReg() || // Relative branch
- TII->getBranchInfo(*I).Target->getMBB() == MBB));
+ (TII->getBranchInfo(*I).isIndirect() ||
+ TII->getBranchInfo(*I).getMBBTarget() == MBB));
HazardRec->emitInstruction(&*I, TakenBranch);
if (TakenBranch)
break;
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 56632e1529a2..b2bab68a6274 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -21,15 +21,32 @@ class ImmediateTLSAsmOperand<string name>
let RenderMethod = "addImmTLSOperands";
}
+class ImmediateOp<ValueType vt, string asmop> : Operand<vt> {
+ let PrintMethod = "print"##asmop##"Operand";
+ let DecoderMethod = "decode"##asmop##"Operand";
+ let ParserMatchClass = !cast<AsmOperandClass>(asmop);
+}
+
+class ImmOpWithPattern<ValueType vt, string asmop, code pred, SDNodeXForm xform,
+ SDNode ImmNode = imm> :
+ ImmediateOp<vt, asmop>, PatLeaf<(vt ImmNode), pred, xform>;
+
+// class ImmediatePatLeaf<ValueType vt, code pred,
+// SDNodeXForm xform, SDNode ImmNode>
+// : PatLeaf<(vt ImmNode), pred, xform>;
+
+
// Constructs both a DAG pattern and instruction operand for an immediate
// of type VT. PRED returns true if a node is acceptable and XFORM returns
// the operand value associated with the node. ASMOP is the name of the
// associated asm operand, and also forms the basis of the asm print method.
-class Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop>
- : PatLeaf<(vt imm), pred, xform>, Operand<vt> {
- let PrintMethod = "print"##asmop##"Operand";
- let DecoderMethod = "decode"##asmop##"Operand";
- let ParserMatchClass = !cast<AsmOperandClass>(asmop);
+multiclass Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop> {
+ // def "" : ImmediateOp<vt, asmop>,
+ // PatLeaf<(vt imm), pred, xform>;
+ def "" : ImmOpWithPattern<vt, asmop, pred, xform>;
+
+// def _timm : PatLeaf<(vt timm), pred, xform>;
+ def _timm : ImmOpWithPattern<vt, asmop, pred, xform, timm>;
}
// Constructs an asm operand for a PC-relative address. SIZE says how
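Most of the mechanical churn in this patch follows from this multiclass: every defm Immediate now produces two records, e.g. imm32zx4 (a plain 'imm' PatLeaf matching ISD::Constant) and imm32zx4_timm (a 'timm' PatLeaf matching ISD::TargetConstant). The C++ sketch below is a generic illustration of where such TargetConstant operands come from during lowering; it is not code from this patch, and the value chosen is arbitrary:

// Generic illustration (assumed context: some SystemZ lowering routine with a
// SelectionDAG in scope); not code from this patch.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue makeMaskOperand(SelectionDAG &DAG, const SDLoc &DL) {
  // ISD::TargetConstant: only the new *_timm operands (e.g. imm32zx4_timm)
  // can match this in an instruction pattern.
  SDValue Target = DAG.getTargetConstant(5, DL, MVT::i32);
  // ISD::Constant: matched by the plain operands (e.g. imm32zx4), and free
  // to be legalized or materialized into a register.
  SDValue Plain = DAG.getConstant(5, DL, MVT::i32);
  (void)Plain;
  return Target;
}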
@@ -295,87 +312,87 @@ def U48Imm : ImmediateAsmOperand<"U48Imm">;
// Immediates for the lower and upper 16 bits of an i32, with the other
// bits of the i32 being zero.
-def imm32ll16 : Immediate<i32, [{
+defm imm32ll16 : Immediate<i32, [{
return SystemZ::isImmLL(N->getZExtValue());
}], LL16, "U16Imm">;
-def imm32lh16 : Immediate<i32, [{
+defm imm32lh16 : Immediate<i32, [{
return SystemZ::isImmLH(N->getZExtValue());
}], LH16, "U16Imm">;
// Immediates for the lower and upper 16 bits of an i32, with the other
// bits of the i32 being one.
-def imm32ll16c : Immediate<i32, [{
+defm imm32ll16c : Immediate<i32, [{
return SystemZ::isImmLL(uint32_t(~N->getZExtValue()));
}], LL16, "U16Imm">;
-def imm32lh16c : Immediate<i32, [{
+defm imm32lh16c : Immediate<i32, [{
return SystemZ::isImmLH(uint32_t(~N->getZExtValue()));
}], LH16, "U16Imm">;
// Short immediates
-def imm32zx1 : Immediate<i32, [{
+defm imm32zx1 : Immediate<i32, [{
return isUInt<1>(N->getZExtValue());
}], NOOP_SDNodeXForm, "U1Imm">;
-def imm32zx2 : Immediate<i32, [{
+defm imm32zx2 : Immediate<i32, [{
return isUInt<2>(N->getZExtValue());
}], NOOP_SDNodeXForm, "U2Imm">;
-def imm32zx3 : Immediate<i32, [{
+defm imm32zx3 : Immediate<i32, [{
return isUInt<3>(N->getZExtValue());
}], NOOP_SDNodeXForm, "U3Imm">;
-def imm32zx4 : Immediate<i32, [{
+defm imm32zx4 : Immediate<i32, [{
return isUInt<4>(N->getZExtValue());
}], NOOP_SDNodeXForm, "U4Imm">;
// Note: this enforces an even value during code generation only.
// When used from the assembler, any 4-bit value is allowed.
-def imm32zx4even : Immediate<i32, [{
+defm imm32zx4even : Immediate<i32, [{
return isUInt<4>(N->getZExtValue());
}], UIMM8EVEN, "U4Imm">;
-def imm32zx6 : Immediate<i32, [{
+defm imm32zx6 : Immediate<i32, [{
return isUInt<6>(N->getZExtValue());
}], NOOP_SDNodeXForm, "U6Imm">;
-def imm32sx8 : Immediate<i32, [{
+defm imm32sx8 : Immediate<i32, [{
return isInt<8>(N->getSExtValue());
}], SIMM8, "S8Imm">;
-def imm32zx8 : Immediate<i32, [{
+defm imm32zx8 : Immediate<i32, [{
return isUInt<8>(N->getZExtValue());
}], UIMM8, "U8Imm">;
-def imm32zx8trunc : Immediate<i32, [{}], UIMM8, "U8Imm">;
+defm imm32zx8trunc : Immediate<i32, [{}], UIMM8, "U8Imm">;
-def imm32zx12 : Immediate<i32, [{
+defm imm32zx12 : Immediate<i32, [{
return isUInt<12>(N->getZExtValue());
}], UIMM12, "U12Imm">;
-def imm32sx16 : Immediate<i32, [{
+defm imm32sx16 : Immediate<i32, [{
return isInt<16>(N->getSExtValue());
}], SIMM16, "S16Imm">;
-def imm32sx16n : Immediate<i32, [{
+defm imm32sx16n : Immediate<i32, [{
return isInt<16>(-N->getSExtValue());
}], NEGSIMM16, "S16Imm">;
-def imm32zx16 : Immediate<i32, [{
+defm imm32zx16 : Immediate<i32, [{
return isUInt<16>(N->getZExtValue());
}], UIMM16, "U16Imm">;
-def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">;
-def imm32zx16trunc : Immediate<i32, [{}], UIMM16, "U16Imm">;
+defm imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">;
+defm imm32zx16trunc : Immediate<i32, [{}], UIMM16, "U16Imm">;
// Full 32-bit immediates. we need both signed and unsigned versions
// because the assembler is picky. E.g. AFI requires signed operands
// while NILF requires unsigned ones.
-def simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">;
-def uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">;
+defm simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">;
+defm uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">;
-def simm32n : Immediate<i32, [{
+defm simm32n : Immediate<i32, [{
return isInt<32>(-N->getSExtValue());
}], NEGSIMM32, "S32Imm">;
@@ -387,107 +404,107 @@ def imm32 : ImmLeaf<i32, [{}]>;
// Immediates for 16-bit chunks of an i64, with the other bits of the
// i32 being zero.
-def imm64ll16 : Immediate<i64, [{
+defm imm64ll16 : Immediate<i64, [{
return SystemZ::isImmLL(N->getZExtValue());
}], LL16, "U16Imm">;
-def imm64lh16 : Immediate<i64, [{
+defm imm64lh16 : Immediate<i64, [{
return SystemZ::isImmLH(N->getZExtValue());
}], LH16, "U16Imm">;
-def imm64hl16 : Immediate<i64, [{
+defm imm64hl16 : Immediate<i64, [{
return SystemZ::isImmHL(N->getZExtValue());
}], HL16, "U16Imm">;
-def imm64hh16 : Immediate<i64, [{
+defm imm64hh16 : Immediate<i64, [{
return SystemZ::isImmHH(N->getZExtValue());
}], HH16, "U16Imm">;
// Immediates for 16-bit chunks of an i64, with the other bits of the
// i32 being one.
-def imm64ll16c : Immediate<i64, [{
+defm imm64ll16c : Immediate<i64, [{
return SystemZ::isImmLL(uint64_t(~N->getZExtValue()));
}], LL16, "U16Imm">;
-def imm64lh16c : Immediate<i64, [{
+defm imm64lh16c : Immediate<i64, [{
return SystemZ::isImmLH(uint64_t(~N->getZExtValue()));
}], LH16, "U16Imm">;
-def imm64hl16c : Immediate<i64, [{
+defm imm64hl16c : Immediate<i64, [{
return SystemZ::isImmHL(uint64_t(~N->getZExtValue()));
}], HL16, "U16Imm">;
-def imm64hh16c : Immediate<i64, [{
+defm imm64hh16c : Immediate<i64, [{
return SystemZ::isImmHH(uint64_t(~N->getZExtValue()));
}], HH16, "U16Imm">;
// Immediates for the lower and upper 32 bits of an i64, with the other
// bits of the i32 being zero.
-def imm64lf32 : Immediate<i64, [{
+defm imm64lf32 : Immediate<i64, [{
return SystemZ::isImmLF(N->getZExtValue());
}], LF32, "U32Imm">;
-def imm64hf32 : Immediate<i64, [{
+defm imm64hf32 : Immediate<i64, [{
return SystemZ::isImmHF(N->getZExtValue());
}], HF32, "U32Imm">;
// Immediates for the lower and upper 32 bits of an i64, with the other
// bits of the i32 being one.
-def imm64lf32c : Immediate<i64, [{
+defm imm64lf32c : Immediate<i64, [{
return SystemZ::isImmLF(uint64_t(~N->getZExtValue()));
}], LF32, "U32Imm">;
-def imm64hf32c : Immediate<i64, [{
+defm imm64hf32c : Immediate<i64, [{
return SystemZ::isImmHF(uint64_t(~N->getZExtValue()));
}], HF32, "U32Imm">;
// Negated immediates that fit LF32 or LH16.
-def imm64lh16n : Immediate<i64, [{
+defm imm64lh16n : Immediate<i64, [{
return SystemZ::isImmLH(uint64_t(-N->getZExtValue()));
}], NEGLH16, "U16Imm">;
-def imm64lf32n : Immediate<i64, [{
+defm imm64lf32n : Immediate<i64, [{
return SystemZ::isImmLF(uint64_t(-N->getZExtValue()));
}], NEGLF32, "U32Imm">;
// Short immediates.
-def imm64sx8 : Immediate<i64, [{
+defm imm64sx8 : Immediate<i64, [{
return isInt<8>(N->getSExtValue());
}], SIMM8, "S8Imm">;
-def imm64zx8 : Immediate<i64, [{
+defm imm64zx8 : Immediate<i64, [{
return isUInt<8>(N->getSExtValue());
}], UIMM8, "U8Imm">;
-def imm64sx16 : Immediate<i64, [{
+defm imm64sx16 : Immediate<i64, [{
return isInt<16>(N->getSExtValue());
}], SIMM16, "S16Imm">;
-def imm64sx16n : Immediate<i64, [{
+defm imm64sx16n : Immediate<i64, [{
return isInt<16>(-N->getSExtValue());
}], NEGSIMM16, "S16Imm">;
-def imm64zx16 : Immediate<i64, [{
+defm imm64zx16 : Immediate<i64, [{
return isUInt<16>(N->getZExtValue());
}], UIMM16, "U16Imm">;
-def imm64sx32 : Immediate<i64, [{
+defm imm64sx32 : Immediate<i64, [{
return isInt<32>(N->getSExtValue());
}], SIMM32, "S32Imm">;
-def imm64sx32n : Immediate<i64, [{
+defm imm64sx32n : Immediate<i64, [{
return isInt<32>(-N->getSExtValue());
}], NEGSIMM32, "S32Imm">;
-def imm64zx32 : Immediate<i64, [{
+defm imm64zx32 : Immediate<i64, [{
return isUInt<32>(N->getZExtValue());
}], UIMM32, "U32Imm">;
-def imm64zx32n : Immediate<i64, [{
+defm imm64zx32n : Immediate<i64, [{
return isUInt<32>(-N->getSExtValue());
}], NEGUIMM32, "U32Imm">;
-def imm64zx48 : Immediate<i64, [{
+defm imm64zx48 : Immediate<i64, [{
return isUInt<64>(N->getZExtValue());
}], UIMM48, "U48Imm">;
@@ -637,7 +654,7 @@ def bdvaddr12only : BDVMode< "64", "12">;
//===----------------------------------------------------------------------===//
// A 4-bit condition-code mask.
-def cond4 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 16); }]>,
+def cond4 : PatLeaf<(i32 timm), [{ return (N->getZExtValue() < 16); }]>,
Operand<i32> {
let PrintMethod = "printCond4Operand";
}
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 15bd12bc98a4..6fe383e64b74 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -472,17 +472,17 @@ def z_subcarry : PatFrag<(ops node:$lhs, node:$rhs),
(z_subcarry_1 node:$lhs, node:$rhs, CC)>;
// Signed and unsigned comparisons.
-def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{
unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
return Type != SystemZICMP::UnsignedOnly;
}]>;
-def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{
unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
return Type != SystemZICMP::SignedOnly;
}]>;
// Register- and memory-based TEST UNDER MASK.
-def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, imm)>;
+def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, timm)>;
def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>;
// Register sign-extend operations. Sub-32-bit values are represented as i32s.
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
index beaf4de285a3..65300fb47627 100644
--- a/lib/Target/SystemZ/SystemZPatterns.td
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -100,12 +100,12 @@ multiclass CondStores64<Instruction insn, Instruction insninv,
SDPatternOperator store, SDPatternOperator load,
AddressingMode mode> {
def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr),
- imm32zx4:$valid, imm32zx4:$cc),
+ imm32zx4_timm:$valid, imm32zx4_timm:$cc),
mode:$addr),
(insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
imm32zx4:$valid, imm32zx4:$cc)>;
def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new,
- imm32zx4:$valid, imm32zx4:$cc),
+ imm32zx4_timm:$valid, imm32zx4_timm:$cc),
mode:$addr),
(insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
imm32zx4:$valid, imm32zx4:$cc)>;
diff --git a/lib/Target/SystemZ/SystemZPostRewrite.cpp b/lib/Target/SystemZ/SystemZPostRewrite.cpp
index 8e4060eac74c..aaa7f8fc88f5 100644
--- a/lib/Target/SystemZ/SystemZPostRewrite.cpp
+++ b/lib/Target/SystemZ/SystemZPostRewrite.cpp
@@ -25,6 +25,7 @@ using namespace llvm;
#define DEBUG_TYPE "systemz-postrewrite"
STATISTIC(MemFoldCopies, "Number of copies inserted before folded mem ops.");
+STATISTIC(LOCRMuxJumps, "Number of LOCRMux jump-sequences (lower is better)");
namespace llvm {
void initializeSystemZPostRewritePass(PassRegistry&);
@@ -45,12 +46,20 @@ public:
StringRef getPassName() const override { return SYSTEMZ_POSTREWRITE_NAME; }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
private:
+ void selectLOCRMux(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned LowOpcode,
+ unsigned HighOpcode);
+ void selectSELRMux(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned LowOpcode,
+ unsigned HighOpcode);
+ bool expandCondMove(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
bool selectMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
bool selectMBB(MachineBasicBlock &MBB);
@@ -68,11 +77,141 @@ FunctionPass *llvm::createSystemZPostRewritePass(SystemZTargetMachine &TM) {
return new SystemZPostRewrite();
}
+// MI is a load-register-on-condition pseudo instruction. Replace it with
+// LowOpcode if source and destination are both low GR32s and HighOpcode if
+// source and destination are both high GR32s. Otherwise, a branch sequence
+// is created.
+void SystemZPostRewrite::selectLOCRMux(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned LowOpcode,
+ unsigned HighOpcode) {
+ Register DestReg = MBBI->getOperand(0).getReg();
+ Register SrcReg = MBBI->getOperand(2).getReg();
+ bool DestIsHigh = SystemZ::isHighReg(DestReg);
+ bool SrcIsHigh = SystemZ::isHighReg(SrcReg);
+
+ if (!DestIsHigh && !SrcIsHigh)
+ MBBI->setDesc(TII->get(LowOpcode));
+ else if (DestIsHigh && SrcIsHigh)
+ MBBI->setDesc(TII->get(HighOpcode));
+ else
+ expandCondMove(MBB, MBBI, NextMBBI);
+}
+
+// MI is a select pseudo instruction. Replace it with LowOpcode if source
+// and destination are all low GR32s and HighOpcode if source and destination
+// are all high GR32s. Otherwise, a branch sequence is created.
+void SystemZPostRewrite::selectSELRMux(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned LowOpcode,
+ unsigned HighOpcode) {
+ Register DestReg = MBBI->getOperand(0).getReg();
+ Register Src1Reg = MBBI->getOperand(1).getReg();
+ Register Src2Reg = MBBI->getOperand(2).getReg();
+ bool DestIsHigh = SystemZ::isHighReg(DestReg);
+ bool Src1IsHigh = SystemZ::isHighReg(Src1Reg);
+ bool Src2IsHigh = SystemZ::isHighReg(Src2Reg);
+
+ // If sources and destination aren't all high or all low, we may be able to
+ // simplify the operation by moving one of the sources to the destination
+ // first. But only if this doesn't clobber the other source.
+ if (DestReg != Src1Reg && DestReg != Src2Reg) {
+ if (DestIsHigh != Src1IsHigh) {
+ BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(),
+ TII->get(SystemZ::COPY), DestReg)
+ .addReg(MBBI->getOperand(1).getReg(), getRegState(MBBI->getOperand(1)));
+ MBBI->getOperand(1).setReg(DestReg);
+ Src1Reg = DestReg;
+ Src1IsHigh = DestIsHigh;
+ } else if (DestIsHigh != Src2IsHigh) {
+ BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(),
+ TII->get(SystemZ::COPY), DestReg)
+ .addReg(MBBI->getOperand(2).getReg(), getRegState(MBBI->getOperand(2)));
+ MBBI->getOperand(2).setReg(DestReg);
+ Src2Reg = DestReg;
+ Src2IsHigh = DestIsHigh;
+ }
+ }
+
+ // If the destination (now) matches one source, prefer this to be first.
+ if (DestReg != Src1Reg && DestReg == Src2Reg) {
+ TII->commuteInstruction(*MBBI, false, 1, 2);
+ std::swap(Src1Reg, Src2Reg);
+ std::swap(Src1IsHigh, Src2IsHigh);
+ }
+
+ if (!DestIsHigh && !Src1IsHigh && !Src2IsHigh)
+ MBBI->setDesc(TII->get(LowOpcode));
+ else if (DestIsHigh && Src1IsHigh && Src2IsHigh)
+ MBBI->setDesc(TII->get(HighOpcode));
+ else
+ // Given the simplification above, we must already have a two-operand case.
+ expandCondMove(MBB, MBBI, NextMBBI);
+}
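// An illustrative sketch (hypothetical register names, not taken from the
// patch) of the mixed-bank simplification above:
//
//   SELRMux %dst(low), %src1(high), %src2(low), CCValid, CCMask
//
// Since %dst differs from both sources and only %src1 sits in the other bank,
// the pass emits
//
//   %dst = COPY %src1        ; unconditional move into the destination
//
// and rewrites operand 1 to %dst. If all three registers now share a bank the
// pseudo becomes a plain SELR/SELFHR; otherwise it is a two-operand
// conditional move and is handed to expandCondMove() below.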
+
+// Replace MBBI by a branch sequence that performs a conditional move of
+// operand 2 to the destination register. Operand 1 is expected to be the
+// same register as the destination.
+bool SystemZPostRewrite::expandCondMove(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineFunction &MF = *MBB.getParent();
+ const BasicBlock *BB = MBB.getBasicBlock();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
+ unsigned CCValid = MI.getOperand(3).getImm();
+ unsigned CCMask = MI.getOperand(4).getImm();
+ assert(DestReg == MI.getOperand(1).getReg() &&
+ "Expected destination and first source operand to be the same.");
+
+ LivePhysRegs LiveRegs(TII->getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+ LiveRegs.stepBackward(*I);
+
+ // Splice MBB at MI, moving the rest of the block into RestMBB.
+ MachineBasicBlock *RestMBB = MF.CreateMachineBasicBlock(BB);
+ MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB);
+ RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end());
+ RestMBB->transferSuccessors(&MBB);
+ for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+ RestMBB->addLiveIn(*I);
+
+ // Create a new block MoveMBB to hold the move instruction.
+ MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB);
+ MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB);
+ MoveMBB->addLiveIn(SrcReg);
+ for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+ MoveMBB->addLiveIn(*I);
+
+ // At the end of MBB, create a conditional branch to RestMBB if the
+ // condition is false, otherwise fall through to MoveMBB.
+ BuildMI(&MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(CCValid).addImm(CCMask ^ CCValid).addMBB(RestMBB);
+ MBB.addSuccessor(RestMBB);
+ MBB.addSuccessor(MoveMBB);
+
+ // In MoveMBB, emit an instruction to move SrcReg into DestReg,
+ // then fall through to RestMBB.
+ BuildMI(*MoveMBB, MoveMBB->end(), DL, TII->get(SystemZ::COPY), DestReg)
+ .addReg(MI.getOperand(2).getReg(), getRegState(MI.getOperand(2)));
+ MoveMBB->addSuccessor(RestMBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+ LOCRMuxJumps++;
+ return true;
+}
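// An illustrative sketch (hypothetical block and register names, not taken
// from the patch) of the control flow expandCondMove() produces:
//
//   MBB:
//     BRC CCValid, CCMask ^ CCValid, RestMBB   ; skip the move if the
//                                              ; condition is false
//   MoveMBB:                                   ; fall-through: condition holds
//     %dst = COPY %src
//   RestMBB:
//     ...                                      ; remainder of the original MBB
//
// The LivePhysRegs walk above only computes live-in sets for the two new
// blocks so the machine verifier stays satisfied after the split.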
+
/// If MBBI references a pseudo instruction that should be selected here,
/// do it and return true. Otherwise return false.
bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
@@ -83,7 +222,7 @@ bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB,
if (TargetMemOpcode != -1) {
MI.setDesc(TII->get(TargetMemOpcode));
MI.tieOperands(0, 1);
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
MachineOperand &SrcMO = MI.getOperand(1);
if (DstReg != SrcMO.getReg()) {
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(SystemZ::COPY), DstReg)
@@ -94,6 +233,15 @@ bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB,
return true;
}
+ switch (Opcode) {
+ case SystemZ::LOCRMux:
+ selectLOCRMux(MBB, MBBI, NextMBBI, SystemZ::LOCR, SystemZ::LOCFHR);
+ return true;
+ case SystemZ::SELRMux:
+ selectSELRMux(MBB, MBBI, NextMBBI, SystemZ::SELR, SystemZ::SELFHR);
+ return true;
+ }
+
return false;
}
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index b27c25beb58c..af33a0300552 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -35,5 +35,6 @@ def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>;
def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>;
def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>;
-def : ProcessorModel<"arch13", Arch13Model, Arch13SupportedFeatures.List>;
+def : ProcessorModel<"arch13", Z15Model, Arch13SupportedFeatures.List>;
+def : ProcessorModel<"z15", Z15Model, Arch13SupportedFeatures.List>;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index e7cd6871dbb4..39ace5594b7f 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -41,7 +41,7 @@ static const TargetRegisterClass *getRC32(MachineOperand &MO,
return &SystemZ::GRH32BitRegClass;
if (VRM && VRM->hasPhys(MO.getReg())) {
- unsigned PhysReg = VRM->getPhys(MO.getReg());
+ Register PhysReg = VRM->getPhys(MO.getReg());
if (SystemZ::GR32BitRegClass.contains(PhysReg))
return &SystemZ::GR32BitRegClass;
assert (SystemZ::GRH32BitRegClass.contains(PhysReg) &&
@@ -120,8 +120,8 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
}
// Add the other operand of the LOCRMux to the worklist.
- unsigned OtherReg =
- (TrueMO.getReg() == Reg ? FalseMO.getReg() : TrueMO.getReg());
+ Register OtherReg =
+ (TrueMO.getReg() == Reg ? FalseMO.getReg() : TrueMO.getReg());
if (MRI->getRegClass(OtherReg) == &SystemZ::GRX32BitRegClass)
Worklist.push_back(OtherReg);
} // end LOCRMux
@@ -169,7 +169,8 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
auto tryAddHint = [&](const MachineOperand *MO) -> void {
Register Reg = MO->getReg();
- Register PhysReg = isPhysicalRegister(Reg) ? Reg : VRM->getPhys(Reg);
+ Register PhysReg =
+ Register::isPhysicalRegister(Reg) ? Reg : VRM->getPhys(Reg);
if (PhysReg) {
if (MO->getSubReg())
PhysReg = getSubReg(PhysReg, MO->getSubReg());
@@ -297,8 +298,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(Mask && "One offset must be OK");
} while (!OpcodeForOffset);
- unsigned ScratchReg =
- MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ Register ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass);
int64_t HighOffset = OldOffset - Offset;
if (MI->getDesc().TSFlags & SystemZII::HasIndex
@@ -351,8 +352,8 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI,
// regalloc may run out of registers.
unsigned WideOpNo = (getRegSizeInBits(*SrcRC) == 128 ? 1 : 0);
- unsigned GR128Reg = MI->getOperand(WideOpNo).getReg();
- unsigned GRNarReg = MI->getOperand((WideOpNo == 1) ? 0 : 1).getReg();
+ Register GR128Reg = MI->getOperand(WideOpNo).getReg();
+ Register GRNarReg = MI->getOperand((WideOpNo == 1) ? 0 : 1).getReg();
LiveInterval &IntGR128 = LIS.getInterval(GR128Reg);
LiveInterval &IntGRNar = LIS.getInterval(GRNarReg);
@@ -385,7 +386,7 @@ bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI,
MEE++;
for (; MII != MEE; ++MII) {
for (const MachineOperand &MO : MII->operands())
- if (MO.isReg() && isPhysicalRegister(MO.getReg())) {
+ if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) {
for (MCSuperRegIterator SI(MO.getReg(), this, true/*IncludeSelf*/);
SI.isValid(); ++SI)
if (NewRC->contains(*SI)) {
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 4f721ec23e53..7044efef1ac6 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -28,6 +28,15 @@ inline unsigned even128(bool Is32bit) {
inline unsigned odd128(bool Is32bit) {
return Is32bit ? subreg_l32 : subreg_l64;
}
+
+// Reg should be a 32-bit GPR. Return true if it is a high register rather
+// than a low register.
+inline bool isHighReg(unsigned int Reg) {
+ if (SystemZ::GRH32BitRegClass.contains(Reg))
+ return true;
+ assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32");
+ return false;
+}
} // end namespace SystemZ
struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td
index 98eca2802242..119e3ee7c22c 100644
--- a/lib/Target/SystemZ/SystemZSchedule.td
+++ b/lib/Target/SystemZ/SystemZSchedule.td
@@ -59,7 +59,7 @@ def VBU : SchedWrite; // Virtual branching unit
def MCD : SchedWrite; // Millicode
-include "SystemZScheduleArch13.td"
+include "SystemZScheduleZ15.td"
include "SystemZScheduleZ14.td"
include "SystemZScheduleZ13.td"
include "SystemZScheduleZEC12.td"
diff --git a/lib/Target/SystemZ/SystemZScheduleArch13.td b/lib/Target/SystemZ/SystemZScheduleZ15.td
index 9f82f24d0e8f..56ceb88f35d4 100644
--- a/lib/Target/SystemZ/SystemZScheduleArch13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -1,4 +1,4 @@
-//-- SystemZScheduleArch13.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
+//-- SystemZScheduleZ15.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,14 +6,14 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the machine model for Arch13 to support instruction
+// This file defines the machine model for Z15 to support instruction
// scheduling and other instruction cost heuristics.
//
// Pseudos expanded right after isel do not need to be modelled here.
//
//===----------------------------------------------------------------------===//
-def Arch13Model : SchedMachineModel {
+def Z15Model : SchedMachineModel {
let UnsupportedFeatures = Arch13UnsupportedFeatures.List;
@@ -27,7 +27,7 @@ def Arch13Model : SchedMachineModel {
let MispredictPenalty = 20;
}
-let SchedModel = Arch13Model in {
+let SchedModel = Z15Model in {
// These definitions need the SchedModel value. They could be put in a
// subtarget common include file, but it seems the include system in Tablegen
// currently (2016) rejects multiple includes of same file.
@@ -73,43 +73,43 @@ let NumMicroOps = 0 in {
}
// Execution units.
-def Arch13_FXaUnit : ProcResource<2>;
-def Arch13_FXbUnit : ProcResource<2>;
-def Arch13_LSUnit : ProcResource<2>;
-def Arch13_VecUnit : ProcResource<2>;
-def Arch13_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
-def Arch13_VBUnit : ProcResource<2>;
-def Arch13_MCD : ProcResource<1>;
+def Z15_FXaUnit : ProcResource<2>;
+def Z15_FXbUnit : ProcResource<2>;
+def Z15_LSUnit : ProcResource<2>;
+def Z15_VecUnit : ProcResource<2>;
+def Z15_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
+def Z15_VBUnit : ProcResource<2>;
+def Z15_MCD : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
let NumMicroOps = 0 in {
- def : WriteRes<FXa, [Arch13_FXaUnit]>;
- def : WriteRes<FXb, [Arch13_FXbUnit]>;
- def : WriteRes<LSU, [Arch13_LSUnit]>;
- def : WriteRes<VecBF, [Arch13_VecUnit]>;
- def : WriteRes<VecDF, [Arch13_VecUnit]>;
- def : WriteRes<VecDFX, [Arch13_VecUnit]>;
- def : WriteRes<VecMul, [Arch13_VecUnit]>;
- def : WriteRes<VecStr, [Arch13_VecUnit]>;
- def : WriteRes<VecXsPm, [Arch13_VecUnit]>;
+ def : WriteRes<FXa, [Z15_FXaUnit]>;
+ def : WriteRes<FXb, [Z15_FXbUnit]>;
+ def : WriteRes<LSU, [Z15_LSUnit]>;
+ def : WriteRes<VecBF, [Z15_VecUnit]>;
+ def : WriteRes<VecDF, [Z15_VecUnit]>;
+ def : WriteRes<VecDFX, [Z15_VecUnit]>;
+ def : WriteRes<VecMul, [Z15_VecUnit]>;
+ def : WriteRes<VecStr, [Z15_VecUnit]>;
+ def : WriteRes<VecXsPm, [Z15_VecUnit]>;
foreach Num = 2-5 in { let ResourceCycles = [Num] in {
- def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Arch13_FXaUnit]>;
- def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Arch13_FXbUnit]>;
- def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Arch13_LSUnit]>;
- def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Arch13_VecUnit]>;
- def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Arch13_VecUnit]>;
- def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Arch13_VecUnit]>;
- def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Arch13_VecUnit]>;
- def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Arch13_VecUnit]>;
- def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Arch13_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FXa"#Num), [Z15_FXaUnit]>;
+ def : WriteRes<!cast<SchedWrite>("FXb"#Num), [Z15_FXbUnit]>;
+ def : WriteRes<!cast<SchedWrite>("LSU"#Num), [Z15_LSUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecBF"#Num), [Z15_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDF"#Num), [Z15_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecDFX"#Num), [Z15_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecMul"#Num), [Z15_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecStr"#Num), [Z15_VecUnit]>;
+ def : WriteRes<!cast<SchedWrite>("VecXsPm"#Num), [Z15_VecUnit]>;
}}
- def : WriteRes<VecFPd, [Arch13_VecFPdUnit]> { let ResourceCycles = [30]; }
+ def : WriteRes<VecFPd, [Z15_VecFPdUnit]> { let ResourceCycles = [30]; }
- def : WriteRes<VBU, [Arch13_VBUnit]>; // Virtual Branching Unit
+ def : WriteRes<VBU, [Z15_VBUnit]>; // Virtual Branching Unit
}
-def : WriteRes<MCD, [Arch13_MCD]> { let NumMicroOps = 3;
+def : WriteRes<MCD, [Z15_MCD]> { let NumMicroOps = 3;
let BeginGroup = 1;
let EndGroup = 1; }
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index a50e6aa59711..47c925dcf730 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -209,10 +209,10 @@ std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr(
// Now select between End and null, depending on whether the character
// was found.
- SDValue Ops[] = {End, DAG.getConstant(0, DL, PtrVT),
- DAG.getConstant(SystemZ::CCMASK_SRST, DL, MVT::i32),
- DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32),
- CCReg};
+ SDValue Ops[] = {
+ End, DAG.getConstant(0, DL, PtrVT),
+ DAG.getTargetConstant(SystemZ::CCMASK_SRST, DL, MVT::i32),
+ DAG.getTargetConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32), CCReg};
End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, PtrVT, Ops);
return std::make_pair(End, Chain);
}
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index e79dfc5b4b9e..2aca22c9082a 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -75,7 +75,7 @@ static void tieOpsIfNeeded(MachineInstr &MI) {
// instead of IIxF.
bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned LLIxL,
unsigned LLIxH) {
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
// The new opcode will clear the other half of the GR64 reg, so
// cancel if that is live.
unsigned thisSubRegIdx =
@@ -86,7 +86,7 @@ bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned LLIxL,
: SystemZ::subreg_l32);
unsigned GR64BitReg =
TRI->getMatchingSuperReg(Reg, thisSubRegIdx, &SystemZ::GR64BitRegClass);
- unsigned OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx);
+ Register OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx);
if (LiveRegs.contains(OtherReg))
return false;
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 5c49e6eff0bf..20865037fe38 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -154,7 +154,7 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
getEffectiveRelocModel(RM),
getEffectiveSystemZCodeModel(CM, getEffectiveRelocModel(RM), JIT),
OL),
- TLOF(llvm::make_unique<TargetLoweringObjectFileELF>()),
+ TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
@@ -176,7 +176,7 @@ public:
ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override {
return new ScheduleDAGMI(C,
- llvm::make_unique<SystemZPostRASchedStrategy>(C),
+ std::make_unique<SystemZPostRASchedStrategy>(C),
/*RemoveKillFlags=*/true);
}
@@ -184,6 +184,7 @@ public:
bool addInstSelector() override;
bool addILPOpts() override;
void addPostRewrite() override;
+ void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
@@ -217,14 +218,14 @@ void SystemZPassConfig::addPostRewrite() {
addPass(createSystemZPostRewritePass(getSystemZTargetMachine()));
}
-void SystemZPassConfig::addPreSched2() {
+void SystemZPassConfig::addPostRegAlloc() {
// PostRewrite needs to be run at -O0 also (in which case addPostRewrite()
// is not called).
if (getOptLevel() == CodeGenOpt::None)
addPass(createSystemZPostRewritePass(getSystemZTargetMachine()));
+}
- addPass(createSystemZExpandPseudoPass(getSystemZTargetMachine()));
-
+void SystemZPassConfig::addPreSched2() {
if (getOptLevel() != CodeGenOpt::None)
addPass(&IfConverterID);
}
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 145cf87ef9f5..11c99aa11174 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -304,7 +304,8 @@ bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
C2.ScaleCost, C2.SetupCost);
}
-unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (!Vector)
// Discount the stack pointer. Also leave out %r0, since it can't
// be used in an address.
@@ -707,7 +708,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// TODO: Fix base implementation which could simplify things a bit here
// (seems to miss on differentiating on scalar/vector types).
- // Only 64 bit vector conversions are natively supported before arch13.
+ // Only 64 bit vector conversions are natively supported before z15.
if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
if (SrcScalarBits == DstScalarBits)
return NumDstVectors;
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 16ce2ef1d7a0..3ba80b31439f 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -56,12 +56,12 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector);
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
unsigned getRegisterBitWidth(bool Vector) const;
- unsigned getCacheLineSize() { return 256; }
- unsigned getPrefetchDistance() { return 2000; }
- unsigned getMinPrefetchStride() { return 2048; }
+ unsigned getCacheLineSize() const override { return 256; }
+ unsigned getPrefetchDistance() const override { return 2000; }
+ unsigned getMinPrefetchStride() const override { return 2048; }
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool prefersVectorizedAddressing() { return false; }
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 17274e1c2c6e..dcd3934de0fa 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -253,6 +253,7 @@ MCSection *TargetLoweringObjectFile::SectionForGlobal(
auto Attrs = GVar->getAttributes();
if ((Attrs.hasAttribute("bss-section") && Kind.isBSS()) ||
(Attrs.hasAttribute("data-section") && Kind.isData()) ||
+ (Attrs.hasAttribute("relro-section") && Kind.isReadOnlyWithRel()) ||
(Attrs.hasAttribute("rodata-section") && Kind.isReadOnly())) {
return getExplicitSectionGlobal(GO, Kind, TM);
}
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 634866d93570..4c98e140f446 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -63,18 +63,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math");
- RESET_OPTION(NoTrappingFPMath, "no-trapping-math");
-
- StringRef Denormal =
- F.getFnAttribute("denormal-fp-math").getValueAsString();
- if (Denormal == "ieee")
- Options.FPDenormalMode = FPDenormal::IEEE;
- else if (Denormal == "preserve-sign")
- Options.FPDenormalMode = FPDenormal::PreserveSign;
- else if (Denormal == "positive-zero")
- Options.FPDenormalMode = FPDenormal::PositiveZero;
- else
- Options.FPDenormalMode = DefaultOptions.FPDenormalMode;
}
/// Returns the code generation relocation model. The choices are static, PIC,
@@ -140,8 +128,8 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
// don't assume the variables to be DSO local unless we actually know
// that for sure. This only has to be done for variables; for functions
// the linker can insert thunks for calling functions from another DLL.
- if (TT.isWindowsGNUEnvironment() && GV && GV->isDeclarationForLinker() &&
- isa<GlobalVariable>(GV))
+ if (TT.isWindowsGNUEnvironment() && TT.isOSBinFormatCOFF() && GV &&
+ GV->isDeclarationForLinker() && isa<GlobalVariable>(GV))
return false;
// On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols
@@ -154,7 +142,9 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
// Make an exception for windows OS in the triple: Some firmware builds use
// *-win32-macho triples. This (accidentally?) produced windows relocations
// without GOT tables in older clang versions; Keep this behaviour.
- if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO()))
+ // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables
+ // either.
+ if (TT.isOSBinFormatCOFF() || TT.isOSWindows())
return true;
// Most PIC code sequences that assume that a symbol is local cannot
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 5d9029682fdd..3ac9c38dfc0b 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -219,7 +219,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
std::error_code EC;
- raw_fd_ostream dest(Filename, EC, sys::fs::F_None);
+ raw_fd_ostream dest(Filename, EC, sys::fs::OF_None);
if (EC) {
*ErrorMessage = strdup(EC.message().c_str());
return true;
diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 09628e872dd5..53a96fd6a97d 100644
--- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -313,16 +313,17 @@ public:
return Optional<wasm::ValType>();
}
- WebAssembly::ExprType parseBlockType(StringRef ID) {
- return StringSwitch<WebAssembly::ExprType>(ID)
- .Case("i32", WebAssembly::ExprType::I32)
- .Case("i64", WebAssembly::ExprType::I64)
- .Case("f32", WebAssembly::ExprType::F32)
- .Case("f64", WebAssembly::ExprType::F64)
- .Case("v128", WebAssembly::ExprType::V128)
- .Case("exnref", WebAssembly::ExprType::Exnref)
- .Case("void", WebAssembly::ExprType::Void)
- .Default(WebAssembly::ExprType::Invalid);
+ WebAssembly::BlockType parseBlockType(StringRef ID) {
+ // Multivalue block types are handled separately in parseSignature
+ return StringSwitch<WebAssembly::BlockType>(ID)
+ .Case("i32", WebAssembly::BlockType::I32)
+ .Case("i64", WebAssembly::BlockType::I64)
+ .Case("f32", WebAssembly::BlockType::F32)
+ .Case("f64", WebAssembly::BlockType::F64)
+ .Case("v128", WebAssembly::BlockType::V128)
+ .Case("exnref", WebAssembly::BlockType::Exnref)
+ .Case("void", WebAssembly::BlockType::Void)
+ .Default(WebAssembly::BlockType::Invalid);
}
bool parseRegTypeList(SmallVectorImpl<wasm::ValType> &Types) {
@@ -343,7 +344,7 @@ public:
int64_t Val = Int.getIntVal();
if (IsNegative)
Val = -Val;
- Operands.push_back(make_unique<WebAssemblyOperand>(
+ Operands.push_back(std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Integer, Int.getLoc(), Int.getEndLoc(),
WebAssemblyOperand::IntOp{Val}));
Parser.Lex();
@@ -356,7 +357,7 @@ public:
return error("Cannot parse real: ", Flt);
if (IsNegative)
Val = -Val;
- Operands.push_back(make_unique<WebAssemblyOperand>(
+ Operands.push_back(std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(),
WebAssemblyOperand::FltOp{Val}));
Parser.Lex();
@@ -378,7 +379,7 @@ public:
}
if (IsNegative)
Val = -Val;
- Operands.push_back(make_unique<WebAssemblyOperand>(
+ Operands.push_back(std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Float, Flt.getLoc(), Flt.getEndLoc(),
WebAssemblyOperand::FltOp{Val}));
Parser.Lex();
@@ -407,7 +408,7 @@ public:
// an opcode until after the assembly matcher, so set a default to fix
// up later.
auto Tok = Lexer.getTok();
- Operands.push_back(make_unique<WebAssemblyOperand>(
+ Operands.push_back(std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Integer, Tok.getLoc(), Tok.getEndLoc(),
WebAssemblyOperand::IntOp{-1}));
}
@@ -416,8 +417,8 @@ public:
}
void addBlockTypeOperand(OperandVector &Operands, SMLoc NameLoc,
- WebAssembly::ExprType BT) {
- Operands.push_back(make_unique<WebAssemblyOperand>(
+ WebAssembly::BlockType BT) {
+ Operands.push_back(std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Integer, NameLoc, NameLoc,
WebAssemblyOperand::IntOp{static_cast<int64_t>(BT)}));
}
@@ -449,13 +450,14 @@ public:
}
// Now construct the name as first operand.
- Operands.push_back(make_unique<WebAssemblyOperand>(
+ Operands.push_back(std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Token, NameLoc, SMLoc::getFromPointer(Name.end()),
WebAssemblyOperand::TokOp{Name}));
// If this instruction is part of a control flow structure, ensure
// proper nesting.
bool ExpectBlockType = false;
+ bool ExpectFuncType = false;
if (Name == "block") {
push(Block);
ExpectBlockType = true;
@@ -489,9 +491,37 @@ public:
if (pop(Name, Block))
return true;
} else if (Name == "end_function") {
+ ensureLocals(getStreamer());
CurrentState = EndFunction;
if (pop(Name, Function) || ensureEmptyNestingStack())
return true;
+ } else if (Name == "call_indirect" || Name == "return_call_indirect") {
+ ExpectFuncType = true;
+ }
+
+ if (ExpectFuncType || (ExpectBlockType && Lexer.is(AsmToken::LParen))) {
+ // This has a special TYPEINDEX operand which in text we
+ // represent as a signature, such that we can re-build this signature,
+ // attach it to an anonymous symbol, which is what WasmObjectWriter
+ // expects to be able to recreate the actual unique-ified type indices.
+ auto Loc = Parser.getTok();
+ auto Signature = std::make_unique<wasm::WasmSignature>();
+ if (parseSignature(Signature.get()))
+ return true;
+ // Got signature as block type, don't need more
+ ExpectBlockType = false;
+ auto &Ctx = getStreamer().getContext();
+ // The "true" here will cause this to be a nameless symbol.
+ MCSymbol *Sym = Ctx.createTempSymbol("typeindex", true);
+ auto *WasmSym = cast<MCSymbolWasm>(Sym);
+ WasmSym->setSignature(Signature.get());
+ addSignature(std::move(Signature));
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+ const MCExpr *Expr = MCSymbolRefExpr::create(
+ WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx);
+ Operands.push_back(std::make_unique<WebAssemblyOperand>(
+ WebAssemblyOperand::Symbol, Loc.getLoc(), Loc.getEndLoc(),
+ WebAssemblyOperand::SymOp{Expr}));
}
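// A sketch of the effect of the block above (the assembly text is assumed,
// not quoted from the patch): a line such as
//
//   call_indirect (i32, i32) -> (i64)
//
// now parses "(i32, i32) -> (i64)" into a wasm::WasmSignature, attaches it to
// a temporary nameless "typeindex" symbol, and pushes that symbol as the
// TYPEINDEX operand so WasmObjectWriter can later resolve it to the real,
// unique-ified type index.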
while (Lexer.isNot(AsmToken::EndOfStatement)) {
@@ -504,7 +534,7 @@ public:
if (ExpectBlockType) {
// Assume this identifier is a block_type.
auto BT = parseBlockType(Id.getString());
- if (BT == WebAssembly::ExprType::Invalid)
+ if (BT == WebAssembly::BlockType::Invalid)
return error("Unknown block type: ", Id);
addBlockTypeOperand(Operands, NameLoc, BT);
Parser.Lex();
@@ -514,7 +544,7 @@ public:
SMLoc End;
if (Parser.parseExpression(Val, End))
return error("Cannot parse symbol: ", Lexer.getTok());
- Operands.push_back(make_unique<WebAssemblyOperand>(
+ Operands.push_back(std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::Symbol, Id.getLoc(), Id.getEndLoc(),
WebAssemblyOperand::SymOp{Val}));
if (checkForP2AlignIfLoadStore(Operands, Name))
@@ -549,7 +579,7 @@ public:
}
case AsmToken::LCurly: {
Parser.Lex();
- auto Op = make_unique<WebAssemblyOperand>(
+ auto Op = std::make_unique<WebAssemblyOperand>(
WebAssemblyOperand::BrList, Tok.getLoc(), Tok.getEndLoc());
if (!Lexer.is(AsmToken::RCurly))
for (;;) {
@@ -572,7 +602,7 @@ public:
}
if (ExpectBlockType && Operands.size() == 1) {
// Support blocks with no operands as default to void.
- addBlockTypeOperand(Operands, NameLoc, WebAssembly::ExprType::Void);
+ addBlockTypeOperand(Operands, NameLoc, WebAssembly::BlockType::Void);
}
Parser.Lex();
return false;
@@ -671,7 +701,7 @@ public:
LastFunctionLabel = LastLabel;
push(Function);
}
- auto Signature = make_unique<wasm::WasmSignature>();
+ auto Signature = std::make_unique<wasm::WasmSignature>();
if (parseSignature(Signature.get()))
return true;
WasmSym->setSignature(Signature.get());
@@ -687,7 +717,7 @@ public:
if (SymName.empty())
return true;
auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
- auto Signature = make_unique<wasm::WasmSignature>();
+ auto Signature = std::make_unique<wasm::WasmSignature>();
if (parseRegTypeList(Signature->Params))
return true;
WasmSym->setSignature(Signature.get());
@@ -737,24 +767,30 @@ public:
return true; // We didn't process this directive.
}
+ // Called either when the first instruction is parsed or the function ends.
+ void ensureLocals(MCStreamer &Out) {
+ if (CurrentState == FunctionStart) {
+ // We haven't seen a .local directive yet. The streamer requires locals to
+ // be encoded as a prelude to the instructions, so emit an empty list of
+ // locals here.
+ auto &TOut = reinterpret_cast<WebAssemblyTargetStreamer &>(
+ *Out.getTargetStreamer());
+ TOut.emitLocal(SmallVector<wasm::ValType, 0>());
+ CurrentState = FunctionLocals;
+ }
+ }
+
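// A sketch of what ensureLocals() guarantees (assembly text is assumed, not
// quoted from the patch): for
//
//   foo:
//     .functype foo () -> ()
//     end_function
//
// the streamer still receives an empty locals list before end_function, the
// same prelude an explicit .local directive would have produced.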
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned & /*Opcode*/,
OperandVector &Operands, MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override {
MCInst Inst;
+ Inst.setLoc(IDLoc);
unsigned MatchResult =
MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
switch (MatchResult) {
case Match_Success: {
- if (CurrentState == FunctionStart) {
- // This is the first instruction in a function, but we haven't seen
- // a .local directive yet. The streamer requires locals to be encoded
- // as a prelude to the instructions, so emit an empty list of locals
- // here.
- auto &TOut = reinterpret_cast<WebAssemblyTargetStreamer &>(
- *Out.getTargetStreamer());
- TOut.emitLocal(SmallVector<wasm::ValType, 0>());
- }
+ ensureLocals(Out);
// Fix unknown p2align operands.
auto Align = WebAssembly::GetDefaultP2AlignAny(Inst.getOpcode());
if (Align != -1U) {
diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index f9bf3f85d30f..9a9c31cff2d5 100644
--- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -24,6 +24,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/TargetRegistry.h"
@@ -213,10 +214,29 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
return MCDisassembler::Fail;
break;
}
- // block_type operands (uint8_t).
+ // block_type operands:
case WebAssembly::OPERAND_SIGNATURE: {
- if (!parseImmediate<uint8_t>(MI, Size, Bytes))
+ int64_t Val;
+ uint64_t PrevSize = Size;
+ if (!nextLEB(Val, Bytes, Size, true))
return MCDisassembler::Fail;
+ if (Val < 0) {
+ // Negative values are single septet value types or empty types
+ if (Size != PrevSize + 1) {
+ MI.addOperand(
+ MCOperand::createImm(int64_t(WebAssembly::BlockType::Invalid)));
+ } else {
+ MI.addOperand(MCOperand::createImm(Val & 0x7f));
+ }
+ } else {
+ // We don't have access to the signature, so create a symbol without one
+ MCSymbol *Sym = getContext().createTempSymbol("typeindex", true);
+ auto *WasmSym = cast<MCSymbolWasm>(Sym);
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+ const MCExpr *Expr = MCSymbolRefExpr::create(
+ WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, getContext());
+ MI.addOperand(MCOperand::createExpr(Expr));
+ }
break;
}
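// An illustrative note (values derived from the LEB128 encoding, not quoted
// from the patch): block types are now read as a signed LEB128 value instead
// of a raw byte. A one-septet negative value is a value type or the empty
// type, e.g. -0x40 & 0x7f == 0x40 (void) and -0x01 & 0x7f == 0x7f (i32),
// while a non-negative value is an index into the type section, for which
// only a placeholder "typeindex" symbol can be created here.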
// FP operands.
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 70b409cf4a90..8314de41021f 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -31,10 +31,12 @@ namespace {
class WebAssemblyAsmBackend final : public MCAsmBackend {
bool Is64Bit;
+ bool IsEmscripten;
public:
- explicit WebAssemblyAsmBackend(bool Is64Bit)
- : MCAsmBackend(support::little), Is64Bit(Is64Bit) {}
+ explicit WebAssemblyAsmBackend(bool Is64Bit, bool IsEmscripten)
+ : MCAsmBackend(support::little), Is64Bit(Is64Bit),
+ IsEmscripten(IsEmscripten) {}
unsigned getNumFixupKinds() const override {
return WebAssembly::NumTargetFixupKinds;
@@ -123,11 +125,11 @@ void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm,
std::unique_ptr<MCObjectTargetWriter>
WebAssemblyAsmBackend::createObjectTargetWriter() const {
- return createWebAssemblyWasmObjectWriter(Is64Bit);
+ return createWebAssemblyWasmObjectWriter(Is64Bit, IsEmscripten);
}
} // end anonymous namespace
MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) {
- return new WebAssemblyAsmBackend(TT.isArch64Bit());
+ return new WebAssemblyAsmBackend(TT.isArch64Bit(), TT.isOSEmscripten());
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
index b5d4d369b726..221ac17b8336 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblyUtilities.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -51,7 +52,9 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
// Print any additional variadic operands.
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- if (Desc.isVariadic())
+ if (Desc.isVariadic()) {
+ if (Desc.getNumOperands() == 0 && MI->getNumOperands() > 0)
+ OS << "\t";
for (auto I = Desc.getNumOperands(), E = MI->getNumOperands(); I < E; ++I) {
// FIXME: For CALL_INDIRECT_VOID, don't print a leading comma, because
// we have an extra flags operand which is not currently printed, for
@@ -62,6 +65,7 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
OS << ", ";
printOperand(MI, I, OS);
}
+ }
// Print any added annotation.
printAnnotation(OS, Annot);
@@ -232,7 +236,16 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
- Op.getExpr()->print(O, &MAI);
+ // call_indirect instructions have a TYPEINDEX operand that we print
+ // as a signature here, such that the assembler can recover this
+ // information.
+ auto SRE = static_cast<const MCSymbolRefExpr *>(Op.getExpr());
+ if (SRE->getKind() == MCSymbolRefExpr::VK_WASM_TYPEINDEX) {
+ auto &Sym = static_cast<const MCSymbolWasm &>(SRE->getSymbol());
+ O << WebAssembly::signatureToString(Sym.getSignature());
+ } else {
+ Op.getExpr()->print(O, &MAI);
+ }
}
}
@@ -259,14 +272,26 @@ void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
unsigned OpNo,
raw_ostream &O) {
- auto Imm = static_cast<unsigned>(MI->getOperand(OpNo).getImm());
- if (Imm != wasm::WASM_TYPE_NORESULT)
- O << WebAssembly::anyTypeToString(Imm);
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm()) {
+ auto Imm = static_cast<unsigned>(Op.getImm());
+ if (Imm != wasm::WASM_TYPE_NORESULT)
+ O << WebAssembly::anyTypeToString(Imm);
+ } else {
+ auto Expr = cast<MCSymbolRefExpr>(Op.getExpr());
+ auto *Sym = cast<MCSymbolWasm>(&Expr->getSymbol());
+ if (Sym->getSignature()) {
+ O << WebAssembly::signatureToString(Sym->getSignature());
+ } else {
+ // Disassembler does not currently produce a signature
+ O << "unknown_type";
+ }
+ }
}
// We have various enums representing a subset of these types, use this
// function to convert any of them to text.
-const char *llvm::WebAssembly::anyTypeToString(unsigned Ty) {
+const char *WebAssembly::anyTypeToString(unsigned Ty) {
switch (Ty) {
case wasm::WASM_TYPE_I32:
return "i32";
@@ -291,6 +316,24 @@ const char *llvm::WebAssembly::anyTypeToString(unsigned Ty) {
}
}
-const char *llvm::WebAssembly::typeToString(wasm::ValType Ty) {
+const char *WebAssembly::typeToString(wasm::ValType Ty) {
return anyTypeToString(static_cast<unsigned>(Ty));
}
+
+std::string WebAssembly::typeListToString(ArrayRef<wasm::ValType> List) {
+ std::string S;
+ for (auto &Ty : List) {
+ if (&Ty != &List[0]) S += ", ";
+ S += WebAssembly::typeToString(Ty);
+ }
+ return S;
+}
+
+std::string WebAssembly::signatureToString(const wasm::WasmSignature *Sig) {
+ std::string S("(");
+ S += typeListToString(Sig->Params);
+ S += ") -> (";
+ S += typeListToString(Sig->Returns);
+ S += ")";
+ return S;
+}
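// An illustrative example (hypothetical signature, not from the patch): for a
// signature with Params = {i32, i64} and Returns = {f32}, signatureToString()
// returns "(i32, i64) -> (f32)", the same text form accepted by .functype and
// printed for call_indirect type-index operands above.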
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
index b979de5028bf..cf37778099a0 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
@@ -58,6 +58,9 @@ namespace WebAssembly {
const char *typeToString(wasm::ValType Ty);
const char *anyTypeToString(unsigned Ty);
+std::string typeListToString(ArrayRef<wasm::ValType> List);
+std::string signatureToString(const wasm::WasmSignature *Sig);
+
} // end namespace WebAssembly
} // end namespace llvm
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index 44b6d6a968a9..1a4c57e66d2f 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -152,6 +152,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
break;
case WebAssembly::OPERAND_FUNCTION32:
case WebAssembly::OPERAND_OFFSET32:
+ case WebAssembly::OPERAND_SIGNATURE:
case WebAssembly::OPERAND_TYPEINDEX:
case WebAssembly::OPERAND_GLOBAL:
case WebAssembly::OPERAND_EVENT:
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 7a9f59b1a4f2..b339860a381d 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -38,7 +38,7 @@ MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
std::unique_ptr<MCObjectTargetWriter>
-createWebAssemblyWasmObjectWriter(bool Is64Bit);
+createWebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten);
namespace WebAssembly {
enum OperandType {
@@ -122,16 +122,22 @@ enum TOF {
namespace llvm {
namespace WebAssembly {
-/// This is used to indicate block signatures.
-enum class ExprType : unsigned {
+/// Used as immediate MachineOperands for block signatures
+enum class BlockType : unsigned {
+ Invalid = 0x00,
Void = 0x40,
- I32 = 0x7F,
- I64 = 0x7E,
- F32 = 0x7D,
- F64 = 0x7C,
- V128 = 0x7B,
- Exnref = 0x68,
- Invalid = 0x00
+ I32 = unsigned(wasm::ValType::I32),
+ I64 = unsigned(wasm::ValType::I64),
+ F32 = unsigned(wasm::ValType::F32),
+ F64 = unsigned(wasm::ValType::F64),
+ V128 = unsigned(wasm::ValType::V128),
+ Exnref = unsigned(wasm::ValType::EXNREF),
+ // Multivalue blocks (and other non-void blocks) are only emitted when the
+ // blocks will never be exited and are at the ends of functions (see
+ // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made
+ // to pop values off the stack, so the exact multivalue signature can always
+ // be inferred from the return type of the parent function in MCInstLower.
+ Multivalue = 0xffff,
};
/// Instruction opcodes emitted via means other than CodeGen.
@@ -191,6 +197,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I32_S:
case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64:
case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64_S:
+ case WebAssembly::LOAD_SPLAT_v8x16:
+ case WebAssembly::LOAD_SPLAT_v8x16_S:
return 0;
case WebAssembly::LOAD16_S_I32:
case WebAssembly::LOAD16_S_I32_S:
@@ -240,6 +248,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I32_S:
case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64:
case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64_S:
+ case WebAssembly::LOAD_SPLAT_v16x8:
+ case WebAssembly::LOAD_SPLAT_v16x8_S:
return 1;
case WebAssembly::LOAD_I32:
case WebAssembly::LOAD_I32_S:
@@ -295,6 +305,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
case WebAssembly::ATOMIC_NOTIFY_S:
case WebAssembly::ATOMIC_WAIT_I32:
case WebAssembly::ATOMIC_WAIT_I32_S:
+ case WebAssembly::LOAD_SPLAT_v32x4:
+ case WebAssembly::LOAD_SPLAT_v32x4_S:
return 2;
case WebAssembly::LOAD_I64:
case WebAssembly::LOAD_I64_S:
@@ -324,31 +336,25 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
case WebAssembly::ATOMIC_RMW_CMPXCHG_I64_S:
case WebAssembly::ATOMIC_WAIT_I64:
case WebAssembly::ATOMIC_WAIT_I64_S:
+ case WebAssembly::LOAD_SPLAT_v64x2:
+ case WebAssembly::LOAD_SPLAT_v64x2_S:
+ case WebAssembly::LOAD_EXTEND_S_v8i16:
+ case WebAssembly::LOAD_EXTEND_S_v8i16_S:
+ case WebAssembly::LOAD_EXTEND_U_v8i16:
+ case WebAssembly::LOAD_EXTEND_U_v8i16_S:
+ case WebAssembly::LOAD_EXTEND_S_v4i32:
+ case WebAssembly::LOAD_EXTEND_S_v4i32_S:
+ case WebAssembly::LOAD_EXTEND_U_v4i32:
+ case WebAssembly::LOAD_EXTEND_U_v4i32_S:
+ case WebAssembly::LOAD_EXTEND_S_v2i64:
+ case WebAssembly::LOAD_EXTEND_S_v2i64_S:
+ case WebAssembly::LOAD_EXTEND_U_v2i64:
+ case WebAssembly::LOAD_EXTEND_U_v2i64_S:
return 3;
- case WebAssembly::LOAD_v16i8:
- case WebAssembly::LOAD_v16i8_S:
- case WebAssembly::LOAD_v8i16:
- case WebAssembly::LOAD_v8i16_S:
- case WebAssembly::LOAD_v4i32:
- case WebAssembly::LOAD_v4i32_S:
- case WebAssembly::LOAD_v2i64:
- case WebAssembly::LOAD_v2i64_S:
- case WebAssembly::LOAD_v4f32:
- case WebAssembly::LOAD_v4f32_S:
- case WebAssembly::LOAD_v2f64:
- case WebAssembly::LOAD_v2f64_S:
- case WebAssembly::STORE_v16i8:
- case WebAssembly::STORE_v16i8_S:
- case WebAssembly::STORE_v8i16:
- case WebAssembly::STORE_v8i16_S:
- case WebAssembly::STORE_v4i32:
- case WebAssembly::STORE_v4i32_S:
- case WebAssembly::STORE_v2i64:
- case WebAssembly::STORE_v2i64_S:
- case WebAssembly::STORE_v4f32:
- case WebAssembly::STORE_v4f32_S:
- case WebAssembly::STORE_v2f64:
- case WebAssembly::STORE_v2f64_S:
+ case WebAssembly::LOAD_V128:
+ case WebAssembly::LOAD_V128_S:
+ case WebAssembly::STORE_V128:
+ case WebAssembly::STORE_V128_S:
return 4;
default:
return -1;
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index e05efef7201b..40926201931a 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -60,39 +60,10 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<wasm::ValType> Types) {
void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
-void WebAssemblyTargetAsmStreamer::emitSignature(
- const wasm::WasmSignature *Sig) {
- OS << "(";
- emitParamList(Sig);
- OS << ") -> (";
- emitReturnList(Sig);
- OS << ")";
-}
-
-void WebAssemblyTargetAsmStreamer::emitParamList(
- const wasm::WasmSignature *Sig) {
- auto &Params = Sig->Params;
- for (auto &Ty : Params) {
- if (&Ty != &Params[0])
- OS << ", ";
- OS << WebAssembly::typeToString(Ty);
- }
-}
-
-void WebAssemblyTargetAsmStreamer::emitReturnList(
- const wasm::WasmSignature *Sig) {
- auto &Returns = Sig->Returns;
- for (auto &Ty : Returns) {
- if (&Ty != &Returns[0])
- OS << ", ";
- OS << WebAssembly::typeToString(Ty);
- }
-}
-
void WebAssemblyTargetAsmStreamer::emitFunctionType(const MCSymbolWasm *Sym) {
assert(Sym->isFunction());
OS << "\t.functype\t" << Sym->getName() << " ";
- emitSignature(Sym->getSignature());
+ OS << WebAssembly::signatureToString(Sym->getSignature());
OS << "\n";
}
@@ -107,7 +78,7 @@ void WebAssemblyTargetAsmStreamer::emitGlobalType(const MCSymbolWasm *Sym) {
void WebAssemblyTargetAsmStreamer::emitEventType(const MCSymbolWasm *Sym) {
assert(Sym->isEvent());
OS << "\t.eventtype\t" << Sym->getName() << " ";
- emitParamList(Sym->getSignature());
+ OS << WebAssembly::typeListToString(Sym->getSignature()->Params);
OS << "\n";
}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 5ea62b179d22..0164f8e572ef 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -56,9 +56,6 @@ protected:
/// This part is for ascii assembly output
class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer {
formatted_raw_ostream &OS;
- void emitSignature(const wasm::WasmSignature *Sig);
- void emitParamList(const wasm::WasmSignature *Sig);
- void emitReturnList(const wasm::WasmSignature *Sig);
public:
WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index a1cc3e268e8f..e7a599e3e175 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
namespace {
class WebAssemblyWasmObjectWriter final : public MCWasmObjectTargetWriter {
public:
- explicit WebAssemblyWasmObjectWriter(bool Is64Bit);
+ explicit WebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten);
private:
unsigned getRelocType(const MCValue &Target,
@@ -39,8 +39,9 @@ private:
};
} // end anonymous namespace
-WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit)
- : MCWasmObjectTargetWriter(Is64Bit) {}
+WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit,
+ bool IsEmscripten)
+ : MCWasmObjectTargetWriter(Is64Bit, IsEmscripten) {}
static const MCSection *getFixupSection(const MCExpr *Expr) {
if (auto SyExp = dyn_cast<MCSymbolRefExpr>(Expr)) {
@@ -116,6 +117,6 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
}
std::unique_ptr<MCObjectTargetWriter>
-llvm::createWebAssemblyWasmObjectWriter(bool Is64Bit) {
- return llvm::make_unique<WebAssemblyWasmObjectWriter>(Is64Bit);
+llvm::createWebAssemblyWasmObjectWriter(bool Is64Bit, bool IsEmscripten) {
+ return std::make_unique<WebAssemblyWasmObjectWriter>(Is64Bit, IsEmscripten);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 7f9d41da3978..5d8b873ce23b 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -67,8 +67,8 @@ MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
}
std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) {
- unsigned RegNo = MO.getReg();
- assert(TargetRegisterInfo::isVirtualRegister(RegNo) &&
+ Register RegNo = MO.getReg();
+ assert(Register::isVirtualRegister(RegNo) &&
"Unlowered physical register encountered during assembly printing");
assert(!MFI->isVRegStackified(RegNo));
unsigned WAReg = MFI->getWAReg(RegNo);
@@ -332,43 +332,15 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// These represent values which are live into the function entry, so there's
// no instruction to emit.
break;
- case WebAssembly::FALLTHROUGH_RETURN_I32:
- case WebAssembly::FALLTHROUGH_RETURN_I32_S:
- case WebAssembly::FALLTHROUGH_RETURN_I64:
- case WebAssembly::FALLTHROUGH_RETURN_I64_S:
- case WebAssembly::FALLTHROUGH_RETURN_F32:
- case WebAssembly::FALLTHROUGH_RETURN_F32_S:
- case WebAssembly::FALLTHROUGH_RETURN_F64:
- case WebAssembly::FALLTHROUGH_RETURN_F64_S:
- case WebAssembly::FALLTHROUGH_RETURN_v16i8:
- case WebAssembly::FALLTHROUGH_RETURN_v16i8_S:
- case WebAssembly::FALLTHROUGH_RETURN_v8i16:
- case WebAssembly::FALLTHROUGH_RETURN_v8i16_S:
- case WebAssembly::FALLTHROUGH_RETURN_v4i32:
- case WebAssembly::FALLTHROUGH_RETURN_v4i32_S:
- case WebAssembly::FALLTHROUGH_RETURN_v2i64:
- case WebAssembly::FALLTHROUGH_RETURN_v2i64_S:
- case WebAssembly::FALLTHROUGH_RETURN_v4f32:
- case WebAssembly::FALLTHROUGH_RETURN_v4f32_S:
- case WebAssembly::FALLTHROUGH_RETURN_v2f64:
- case WebAssembly::FALLTHROUGH_RETURN_v2f64_S: {
+ case WebAssembly::FALLTHROUGH_RETURN: {
// These instructions represent the implicit return at the end of a
- // function body. Always pops one value off the stack.
+ // function body.
if (isVerbose()) {
- OutStreamer->AddComment("fallthrough-return-value");
+ OutStreamer->AddComment("fallthrough-return");
OutStreamer->AddBlankLine();
}
break;
}
- case WebAssembly::FALLTHROUGH_RETURN_VOID:
- case WebAssembly::FALLTHROUGH_RETURN_VOID_S:
- // This instruction represents the implicit return at the end of a
- // function body with no return value.
- if (isVerbose()) {
- OutStreamer->AddComment("fallthrough-return-void");
- OutStreamer->AddBlankLine();
- }
- break;
case WebAssembly::COMPILER_FENCE:
// This is a compiler barrier that prevents instruction reordering during
// backend compilation, and should not be emitted.
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index 4c5d0192fc28..c069af9eed62 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -97,14 +97,14 @@ public:
// If the smallest region containing MBB is a loop
if (LoopMap.count(ML))
return LoopMap[ML].get();
- LoopMap[ML] = llvm::make_unique<ConcreteRegion<MachineLoop>>(ML);
+ LoopMap[ML] = std::make_unique<ConcreteRegion<MachineLoop>>(ML);
return LoopMap[ML].get();
} else {
// If the smallest region containing MBB is an exception
if (ExceptionMap.count(WE))
return ExceptionMap[WE].get();
ExceptionMap[WE] =
- llvm::make_unique<ConcreteRegion<WebAssemblyException>>(WE);
+ std::make_unique<ConcreteRegion<WebAssemblyException>>(WE);
return ExceptionMap[WE].get();
}
}
@@ -317,6 +317,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
// If Next was originally ordered before MBB, and it isn't because it was
// loop-rotated above the header, it's not preferred.
if (Next->getNumber() < MBB->getNumber() &&
+ (WasmDisableEHPadSort || !Next->isEHPad()) &&
(!R || !R->contains(Next) ||
R->getHeader()->getNumber() < Next->getNumber())) {
Ready.push(Next);
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index e6bfc5226e2e..7e867edaaa27 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
@@ -315,12 +316,12 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
// br_on_exn 0, $__cpp_exception
// rethrow
// end_block
- WebAssembly::ExprType ReturnType = WebAssembly::ExprType::Void;
+ WebAssembly::BlockType ReturnType = WebAssembly::BlockType::Void;
if (IsBrOnExn) {
const char *TagName = BrOnExn->getOperand(1).getSymbolName();
if (std::strcmp(TagName, "__cpp_exception") != 0)
llvm_unreachable("Only C++ exception is supported");
- ReturnType = WebAssembly::ExprType::I32;
+ ReturnType = WebAssembly::BlockType::I32;
}
auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet);
@@ -406,7 +407,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
auto InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet);
MachineInstr *Begin = BuildMI(MBB, InsertPos, MBB.findDebugLoc(InsertPos),
TII.get(WebAssembly::LOOP))
- .addImm(int64_t(WebAssembly::ExprType::Void));
+ .addImm(int64_t(WebAssembly::BlockType::Void));
// Decide where in Header to put the END_LOOP.
BeforeSet.clear();
@@ -526,46 +527,56 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
AfterSet.insert(&MI);
}
- // Local expression tree should go after the TRY.
- for (auto I = Header->getFirstTerminator(), E = Header->begin(); I != E;
- --I) {
- if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition())
- continue;
- if (WebAssembly::isChild(*std::prev(I), MFI))
- AfterSet.insert(&*std::prev(I));
- else
- break;
- }
-
// If Header unwinds to MBB (= Header contains 'invoke'), the try block should
// contain the call within it. So the call should go after the TRY. The
// exception is when the header's terminator is a rethrow instruction, in
which case that instruction, not a call instruction before it, is going to
// throw.
+ MachineInstr *ThrowingCall = nullptr;
if (MBB.isPredecessor(Header)) {
auto TermPos = Header->getFirstTerminator();
if (TermPos == Header->end() ||
TermPos->getOpcode() != WebAssembly::RETHROW) {
- for (const auto &MI : reverse(*Header)) {
+ for (auto &MI : reverse(*Header)) {
if (MI.isCall()) {
AfterSet.insert(&MI);
+ ThrowingCall = &MI;
// Possibly throwing calls are usually wrapped by EH_LABEL
// instructions. We don't want to split them and the call.
if (MI.getIterator() != Header->begin() &&
- std::prev(MI.getIterator())->isEHLabel())
+ std::prev(MI.getIterator())->isEHLabel()) {
AfterSet.insert(&*std::prev(MI.getIterator()));
+ ThrowingCall = &*std::prev(MI.getIterator());
+ }
break;
}
}
}
}
+ // Local expression tree should go after the TRY.
+ // For BLOCK placement, we start the search from the previous instruction of a
+ // BB's terminator, but in TRY's case, we should start from the previous
+ // instruction of a call that can throw, or a EH_LABEL that precedes the call,
+ // because the return values of the call's previous instructions can be
+ // stackified and consumed by the throwing call.
+ auto SearchStartPt = ThrowingCall ? MachineBasicBlock::iterator(ThrowingCall)
+ : Header->getFirstTerminator();
+ for (auto I = SearchStartPt, E = Header->begin(); I != E; --I) {
+ if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition())
+ continue;
+ if (WebAssembly::isChild(*std::prev(I), MFI))
+ AfterSet.insert(&*std::prev(I));
+ else
+ break;
+ }
+
// Add the TRY.
auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet);
MachineInstr *Begin =
BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
TII.get(WebAssembly::TRY))
- .addImm(int64_t(WebAssembly::ExprType::Void));
+ .addImm(int64_t(WebAssembly::BlockType::Void));
// Decide where in Header to put the END_TRY.
BeforeSet.clear();
@@ -694,8 +705,26 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
}
}
+// When MBB is split into MBB and Split, we should unstackify defs in MBB that
+// have their uses in Split.
+static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
+ MachineBasicBlock &Split,
+ WebAssemblyFunctionInfo &MFI,
+ MachineRegisterInfo &MRI) {
+ for (auto &MI : Split) {
+ for (auto &MO : MI.explicit_uses()) {
+ if (!MO.isReg() || Register::isPhysicalRegister(MO.getReg()))
+ continue;
+ if (MachineInstr *Def = MRI.getUniqueVRegDef(MO.getReg()))
+ if (Def->getParent() == &MBB)
+ MFI.unstackifyVReg(MO.getReg());
+ }
+ }
+}
+
bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
// Linearizing the control flow by placing TRY / END_TRY markers can create
@@ -830,7 +859,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
SmallVector<const MachineBasicBlock *, 8> EHPadStack;
// Range of instructions to be wrapped in a new nested try/catch
using TryRange = std::pair<MachineInstr *, MachineInstr *>;
- // In original CFG, <unwind destionation BB, a vector of try ranges>
+ // In original CFG, <unwind destination BB, a vector of try ranges>
DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> UnwindDestToTryRanges;
// In new CFG, <destination to branch to, a vector of try ranges>
DenseMap<MachineBasicBlock *, SmallVector<TryRange, 4>> BrDestToTryRanges;
@@ -936,7 +965,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
// of the function with a local.get and a rethrow instruction.
if (NeedAppendixBlock) {
auto *AppendixBB = getAppendixBlock(MF);
- unsigned ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
+ Register ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
BuildMI(AppendixBB, DebugLoc(), TII.get(WebAssembly::RETHROW))
.addReg(ExnReg);
// These instruction ranges should branch to this appendix BB.
@@ -967,7 +996,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
// ...
// cont:
for (auto &P : UnwindDestToTryRanges) {
- NumUnwindMismatches++;
+ NumUnwindMismatches += P.second.size();
// This means the destination is the appendix BB, which was separately
// handled above.
@@ -1007,6 +1036,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
BrDest->insert(BrDest->end(), EndTry->removeFromParent());
// Take out the handler body from EH pad to the new branch destination BB.
BrDest->splice(BrDest->end(), EHPad, SplitPos, EHPad->end());
+ unstackifyVRegsUsedInSplitBB(*EHPad, *BrDest, MFI, MRI);
// Fix predecessor-successor relationship.
BrDest->transferSuccessors(EHPad);
EHPad->addSuccessor(BrDest);
@@ -1100,7 +1130,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
MachineInstr *NestedTry =
BuildMI(*MBB, *RangeBegin, RangeBegin->getDebugLoc(),
TII.get(WebAssembly::TRY))
- .addImm(int64_t(WebAssembly::ExprType::Void));
+ .addImm(int64_t(WebAssembly::BlockType::Void));
// Create the nested EH pad and fill instructions in.
MachineBasicBlock *NestedEHPad = MF.CreateMachineBasicBlock();
@@ -1122,6 +1152,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
// new nested continuation BB.
NestedCont->splice(NestedCont->end(), MBB,
std::next(RangeEnd->getIterator()), MBB->end());
+ unstackifyVRegsUsedInSplitBB(*MBB, *NestedCont, MFI, MRI);
registerTryScope(NestedTry, NestedEndTry, NestedEHPad);
// Fix predecessor-successor relationship.
@@ -1197,54 +1228,32 @@ getDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
/// checks for such cases and fixes up the signatures.
void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
- assert(MFI.getResults().size() <= 1);
if (MFI.getResults().empty())
return;
- WebAssembly::ExprType RetType;
- switch (MFI.getResults().front().SimpleTy) {
- case MVT::i32:
- RetType = WebAssembly::ExprType::I32;
- break;
- case MVT::i64:
- RetType = WebAssembly::ExprType::I64;
- break;
- case MVT::f32:
- RetType = WebAssembly::ExprType::F32;
- break;
- case MVT::f64:
- RetType = WebAssembly::ExprType::F64;
- break;
- case MVT::v16i8:
- case MVT::v8i16:
- case MVT::v4i32:
- case MVT::v2i64:
- case MVT::v4f32:
- case MVT::v2f64:
- RetType = WebAssembly::ExprType::V128;
- break;
- case MVT::exnref:
- RetType = WebAssembly::ExprType::Exnref;
- break;
- default:
- llvm_unreachable("unexpected return type");
- }
+ // MCInstLower will add the proper types to multivalue signatures based on the
+ // function return type.
+ WebAssembly::BlockType RetType =
+ MFI.getResults().size() > 1
+ ? WebAssembly::BlockType::Multivalue
+ : WebAssembly::BlockType(
+ WebAssembly::toValType(MFI.getResults().front()));
for (MachineBasicBlock &MBB : reverse(MF)) {
for (MachineInstr &MI : reverse(MBB)) {
if (MI.isPosition() || MI.isDebugInstr())
continue;
- if (MI.getOpcode() == WebAssembly::END_BLOCK) {
- EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
- continue;
- }
- if (MI.getOpcode() == WebAssembly::END_LOOP) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::END_BLOCK:
+ case WebAssembly::END_LOOP:
+ case WebAssembly::END_TRY:
EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
continue;
+ default:
+ // Something other than an `end`. We're done.
+ return;
}
- // Something other than an `end`. We're done.
- return;
}
}
}
@@ -1280,7 +1289,9 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) {
}
}
// Fix mismatches in unwind destinations induced by linearizing the code.
- fixUnwindMismatches(MF);
+ if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
+ MF.getFunction().hasPersonalityFn())
+ fixUnwindMismatches(MF);
}
void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
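A minimal sketch of how the new unstackifyVRegsUsedInSplitBB helper is meant to be used, assuming a block MBB being split at SplitPos into a freshly created block (the names MF, MBB, SplitPos, MFI and MRI are illustrative stand-ins for whatever the caller has in scope, mirroring the two call sites above):

  MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
  MF.insert(std::next(MBB.getIterator()), NewBB);
  // Move everything from SplitPos to the end of MBB into the new block.
  NewBB->splice(NewBB->end(), &MBB, SplitPos, MBB.end());
  // Any def left behind in MBB whose use moved into NewBB can no longer stay
  // on the wasm value stack, so drop its stackified flag.
  unstackifyVRegsUsedInSplitBB(MBB, *NewBB, MFI, MRI);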
diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index dbd62179f055..ef75bb215317 100644
--- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -168,7 +168,7 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) {
static MachineInstr *findStartOfTree(MachineOperand &MO,
MachineRegisterInfo &MRI,
WebAssemblyFunctionInfo &MFI) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
assert(MFI.isVRegStackified(Reg));
MachineInstr *Def = MRI.getVRegDef(Reg);
@@ -207,7 +207,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I++;
if (!WebAssembly::isArgument(MI.getOpcode()))
break;
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
assert(!MFI.isVRegStackified(Reg));
Reg2Local[Reg] = static_cast<unsigned>(MI.getOperand(1).getImm());
MI.eraseFromParent();
@@ -221,7 +221,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// drops to their defs.
BitVector UseEmpty(MRI.getNumVirtRegs());
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I)
- UseEmpty[I] = MRI.use_empty(TargetRegisterInfo::index2VirtReg(I));
+ UseEmpty[I] = MRI.use_empty(Register::index2VirtReg(I));
// Visit each instruction in the function.
for (MachineBasicBlock &MBB : MF) {
@@ -238,13 +238,13 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
if (WebAssembly::isTee(MI.getOpcode())) {
assert(MFI.isVRegStackified(MI.getOperand(0).getReg()));
assert(!MFI.isVRegStackified(MI.getOperand(1).getReg()));
- unsigned OldReg = MI.getOperand(2).getReg();
+ Register OldReg = MI.getOperand(2).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
// Stackify the input if it isn't stackified yet.
if (!MFI.isVRegStackified(OldReg)) {
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
- unsigned NewReg = MRI.createVirtualRegister(RC);
+ Register NewReg = MRI.createVirtualRegister(RC);
unsigned Opc = getLocalGetOpcode(RC);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc), NewReg)
.addImm(LocalId);
@@ -270,17 +270,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// we handle at most one def.
assert(MI.getDesc().getNumDefs() <= 1);
if (MI.getDesc().getNumDefs() == 1) {
- unsigned OldReg = MI.getOperand(0).getReg();
+ Register OldReg = MI.getOperand(0).getReg();
if (!MFI.isVRegStackified(OldReg)) {
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
- unsigned NewReg = MRI.createVirtualRegister(RC);
+ Register NewReg = MRI.createVirtualRegister(RC);
auto InsertPt = std::next(MI.getIterator());
if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) {
MI.eraseFromParent();
Changed = true;
continue;
}
- if (UseEmpty[TargetRegisterInfo::virtReg2Index(OldReg)]) {
+ if (UseEmpty[Register::virtReg2Index(OldReg)]) {
unsigned Opc = getDropOpcode(RC);
MachineInstr *Drop =
BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
@@ -310,7 +310,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
if (!MO.isReg())
continue;
- unsigned OldReg = MO.getReg();
+ Register OldReg = MO.getReg();
// Inline asm may have a def in the middle of the operands. Our contract
// with inline asm register operands is to provide local indices as
@@ -345,7 +345,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Insert a local.get.
unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
- unsigned NewReg = MRI.createVirtualRegister(RC);
+ Register NewReg = MRI.createVirtualRegister(RC);
unsigned Opc = getLocalGetOpcode(RC);
InsertPt =
BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc), NewReg)
@@ -369,7 +369,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// TODO: Sort the locals for better compression.
MFI.setNumLocals(CurLocal - MFI.getParams().size());
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned Reg = Register::index2VirtReg(I);
auto RL = Reg2Local.find(Reg);
if (RL == Reg2Local.end() || RL->second < MFI.getParams().size())
continue;
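The unsigned-to-Register changes in this file rely on the static helpers that now live on llvm::Register rather than TargetRegisterInfo. A small illustrative sketch (the index is chosen arbitrarily), assuming llvm/CodeGen/Register.h from this tree:

  #include "llvm/CodeGen/Register.h"
  #include <cassert>
  using namespace llvm;

  static unsigned roundTrip(unsigned VRegIndex) {
    Register R = Register::index2VirtReg(VRegIndex);
    assert(Register::isVirtualRegister(R)); // virtual registers sit in the high range
    return Register::virtReg2Index(R);      // returns VRegIndex again
  }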
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 2552e9150833..c932f985489a 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -1141,14 +1141,14 @@ bool WebAssemblyFastISel::selectBitCast(const Instruction *I) {
return true;
}
- unsigned Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(),
+ Register Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(),
In, I->getOperand(0)->hasOneUse());
if (!Reg)
return false;
MachineBasicBlock::iterator Iter = FuncInfo.InsertPt;
--Iter;
assert(Iter->isBitcast());
- Iter->setPhysRegsDeadExcept(ArrayRef<unsigned>(), TRI);
+ Iter->setPhysRegsDeadExcept(ArrayRef<Register>(), TRI);
updateValueMap(I, Reg);
return true;
}
@@ -1302,51 +1302,33 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
if (Ret->getNumOperands() == 0) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(WebAssembly::RETURN_VOID));
+ TII.get(WebAssembly::RETURN));
return true;
}
+ // TODO: support multiple return in FastISel
+ if (Ret->getNumOperands() > 1)
+ return false;
+
Value *RV = Ret->getOperand(0);
if (!Subtarget->hasSIMD128() && RV->getType()->isVectorTy())
return false;
- unsigned Opc;
switch (getSimpleType(RV->getType())) {
case MVT::i1:
case MVT::i8:
case MVT::i16:
case MVT::i32:
- Opc = WebAssembly::RETURN_I32;
- break;
case MVT::i64:
- Opc = WebAssembly::RETURN_I64;
- break;
case MVT::f32:
- Opc = WebAssembly::RETURN_F32;
- break;
case MVT::f64:
- Opc = WebAssembly::RETURN_F64;
- break;
case MVT::v16i8:
- Opc = WebAssembly::RETURN_v16i8;
- break;
case MVT::v8i16:
- Opc = WebAssembly::RETURN_v8i16;
- break;
case MVT::v4i32:
- Opc = WebAssembly::RETURN_v4i32;
- break;
case MVT::v2i64:
- Opc = WebAssembly::RETURN_v2i64;
- break;
case MVT::v4f32:
- Opc = WebAssembly::RETURN_v4f32;
- break;
case MVT::v2f64:
- Opc = WebAssembly::RETURN_v2f64;
- break;
case MVT::exnref:
- Opc = WebAssembly::RETURN_EXNREF;
break;
default:
return false;
@@ -1363,7 +1345,9 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
if (Reg == 0)
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)).addReg(Reg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(WebAssembly::RETURN))
+ .addReg(Reg);
return true;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index b7fc65401fc4..6b1bbd7a2b07 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -70,6 +70,8 @@ static void findUses(Value *V, Function &F,
for (Use &U : V->uses()) {
if (auto *BC = dyn_cast<BitCastOperator>(U.getUser()))
findUses(BC, F, Uses, ConstantBCs);
+ else if (auto *A = dyn_cast<GlobalAlias>(U.getUser()))
+ findUses(A, F, Uses, ConstantBCs);
else if (U.get()->getType() != F.getType()) {
CallSite CS(U.getUser());
if (!CS)
diff --git a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
index 7d8e86d9b2c0..157ea9d525c9 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -56,6 +56,7 @@
#include "WebAssembly.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-fix-irreducible-control-flow"
@@ -358,7 +359,7 @@ void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop(
// Add the register which will be used to tell the jump table which block to
// jump to.
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
MIB.addReg(Reg);
// Compute the indices in the superheader, one for each bad block, and
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 5299068efdd4..71eeebfada4b 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -183,14 +183,14 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
bool HasBP = hasBP(MF);
if (HasBP) {
auto FI = MF.getInfo<WebAssemblyFunctionInfo>();
- unsigned BasePtr = MRI.createVirtualRegister(PtrRC);
+ Register BasePtr = MRI.createVirtualRegister(PtrRC);
FI->setBasePointerVreg(BasePtr);
BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY), BasePtr)
.addReg(SPReg);
}
if (StackSize) {
// Subtract the frame size
- unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+ Register OffsetReg = MRI.createVirtualRegister(PtrRC);
BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
.addImm(StackSize);
BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::SUB_I32),
@@ -199,7 +199,7 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(OffsetReg);
}
if (HasBP) {
- unsigned BitmaskReg = MRI.createVirtualRegister(PtrRC);
+ Register BitmaskReg = MRI.createVirtualRegister(PtrRC);
unsigned Alignment = MFI.getMaxAlignment();
assert((1u << countTrailingZeros(Alignment)) == Alignment &&
"Alignment must be a power of 2");
@@ -244,7 +244,7 @@ void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
} else if (StackSize) {
const TargetRegisterClass *PtrRC =
MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
- unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+ Register OffsetReg = MRI.createVirtualRegister(PtrRC);
BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
.addImm(StackSize);
// In the epilog we don't need to write the result back to the SP32 physreg
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index daddd4ca16ff..fdc0f561dcd9 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -29,9 +29,9 @@ public:
static const size_t RedZoneSize = 128;
WebAssemblyFrameLowering()
- : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/16,
+ : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/Align(16),
/*LocalAreaOffset=*/0,
- /*TransientStackAlignment=*/16,
+ /*TransientStackAlignment=*/Align(16),
/*StackRealignable=*/true) {}
MachineBasicBlock::iterator
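The frame-lowering constructor now passes llvm::Align, which carries a power-of-two alignment as a distinct type instead of a raw integer. A brief sketch of the type's behavior, assuming llvm/Support/Alignment.h from this tree:

  #include "llvm/Support/Alignment.h"
  #include <cstdint>
  using namespace llvm;

  static uint64_t stackAlignBytes() {
    Align StackAlign(16);        // constructing from a non-power-of-two asserts
    return StackAlign.value();   // 16
  }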
diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
index 77217f16a727..13f0476eb4a5 100644
--- a/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -26,9 +26,11 @@ HANDLE_NODETYPE(WrapperPIC)
HANDLE_NODETYPE(BR_IF)
HANDLE_NODETYPE(BR_TABLE)
HANDLE_NODETYPE(SHUFFLE)
+HANDLE_NODETYPE(SWIZZLE)
HANDLE_NODETYPE(VEC_SHL)
HANDLE_NODETYPE(VEC_SHR_S)
HANDLE_NODETYPE(VEC_SHR_U)
+HANDLE_NODETYPE(LOAD_SPLAT)
HANDLE_NODETYPE(THROW)
HANDLE_NODETYPE(MEMORY_COPY)
HANDLE_NODETYPE(MEMORY_FILL)
diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 26339eaef37d..f83a8a984ae0 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -54,6 +54,12 @@ public:
ForCodeSize = MF.getFunction().hasOptSize();
Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
+
+ // Wasm64 is not fully supported right now (and is not specified)
+ if (Subtarget->hasAddr64())
+ report_fatal_error(
+ "64-bit WebAssembly (wasm64) is not currently supported");
+
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -88,88 +94,36 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
uint64_t SyncScopeID =
cast<ConstantSDNode>(Node->getOperand(2).getNode())->getZExtValue();
+ MachineSDNode *Fence = nullptr;
switch (SyncScopeID) {
- case SyncScope::SingleThread: {
+ case SyncScope::SingleThread:
// We lower a single-thread fence to a pseudo compiler barrier instruction
// preventing instruction reordering. This will not be emitted in the final
// binary.
- MachineSDNode *Fence =
- CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE,
- DL, // debug loc
- MVT::Other, // outchain type
- Node->getOperand(0) // inchain
- );
- ReplaceNode(Node, Fence);
- CurDAG->RemoveDeadNode(Node);
- return;
- }
-
- case SyncScope::System: {
- // For non-emscripten systems, we have not decided on what we should
- // traslate fences to yet.
- if (!Subtarget->getTargetTriple().isOSEmscripten())
- report_fatal_error(
- "ATOMIC_FENCE is not yet supported in non-emscripten OSes");
-
- // Wasm does not have a fence instruction, but because all atomic
- // instructions in wasm are sequentially consistent, we translate a
- // fence to an idempotent atomic RMW instruction to a linear memory
- // address. All atomic instructions in wasm are sequentially consistent,
- // but this is to ensure a fence also prevents reordering of non-atomic
- // instructions in the VM. Even though LLVM IR's fence instruction does
- // not say anything about its relationship with non-atomic instructions,
- // we think this is more user-friendly.
- //
- // While any address can work, here we use a value stored in
- // __stack_pointer wasm global because there's high chance that area is
- // in cache.
- //
- // So the selected instructions will be in the form of:
- // %addr = get_global $__stack_pointer
- // %0 = i32.const 0
- // i32.atomic.rmw.or %addr, %0
- SDValue StackPtrSym = CurDAG->getTargetExternalSymbol(
- "__stack_pointer", TLI->getPointerTy(CurDAG->getDataLayout()));
- MachineSDNode *GetGlobal =
- CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32, // opcode
- DL, // debug loc
- MVT::i32, // result type
- StackPtrSym // __stack_pointer symbol
- );
-
- SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
- auto *MMO = MF.getMachineMemOperand(
- MachinePointerInfo::getUnknownStack(MF),
- // FIXME Volatile isn't really correct, but currently all LLVM
- // atomic instructions are treated as volatiles in the backend, so
- // we should be consistent.
- MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore,
- 4, 4, AAMDNodes(), nullptr, SyncScope::System,
- AtomicOrdering::SequentiallyConsistent);
- MachineSDNode *Const0 =
- CurDAG->getMachineNode(WebAssembly::CONST_I32, DL, MVT::i32, Zero);
- MachineSDNode *AtomicRMW = CurDAG->getMachineNode(
- WebAssembly::ATOMIC_RMW_OR_I32, // opcode
- DL, // debug loc
- MVT::i32, // result type
- MVT::Other, // outchain type
- {
- Zero, // alignment
- Zero, // offset
- SDValue(GetGlobal, 0), // __stack_pointer
- SDValue(Const0, 0), // OR with 0 to make it idempotent
- Node->getOperand(0) // inchain
- });
-
- CurDAG->setNodeMemRefs(AtomicRMW, {MMO});
- ReplaceUses(SDValue(Node, 0), SDValue(AtomicRMW, 1));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ Fence = CurDAG->getMachineNode(WebAssembly::COMPILER_FENCE,
+ DL, // debug loc
+ MVT::Other, // outchain type
+ Node->getOperand(0) // inchain
+ );
+ break;
+ case SyncScope::System:
+ // Currently wasm only supports sequentially consistent atomics, so we
+ // always set the order to 0 (sequentially consistent).
+ Fence = CurDAG->getMachineNode(
+ WebAssembly::ATOMIC_FENCE,
+ DL, // debug loc
+ MVT::Other, // outchain type
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // order
+ Node->getOperand(0) // inchain
+ );
+ break;
default:
llvm_unreachable("Unknown scope!");
}
+
+ ReplaceNode(Node, Fence);
+ CurDAG->RemoveDeadNode(Node);
+ return;
}
case ISD::GlobalTLSAddress: {
@@ -224,6 +178,33 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, TLSSize);
return;
}
+ case Intrinsic::wasm_tls_align: {
+ MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
+ assert(PtrVT == MVT::i32 && "only wasm32 is supported for now");
+
+ MachineSDNode *TLSAlign = CurDAG->getMachineNode(
+ WebAssembly::GLOBAL_GET_I32, DL, PtrVT,
+ CurDAG->getTargetExternalSymbol("__tls_align", MVT::i32));
+ ReplaceNode(Node, TLSAlign);
+ return;
+ }
+ }
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ case Intrinsic::wasm_tls_base: {
+ MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
+ assert(PtrVT == MVT::i32 && "only wasm32 is supported for now");
+
+ MachineSDNode *TLSBase = CurDAG->getMachineNode(
+ WebAssembly::GLOBAL_GET_I32, DL, MVT::i32, MVT::Other,
+ CurDAG->getTargetExternalSymbol("__tls_base", PtrVT),
+ Node->getOperand(0));
+ ReplaceNode(Node, TLSBase);
+ return;
+ }
}
break;
}
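At the C++ source level the two sync-scope cases above correspond to the standard fences: a sequentially consistent thread fence reaches instruction selection as a system-scope fence node and is now selected to the real atomic.fence instruction, while a signal (single-thread) fence is selected to the COMPILER_FENCE pseudo, which only constrains scheduling and emits no bytes. A hedged usage sketch (the exact output also depends on which atomics features are enabled):

  #include <atomic>

  void fences() {
    std::atomic_thread_fence(std::memory_order_seq_cst); // SyncScope::System -> atomic.fence
    std::atomic_signal_fence(std::memory_order_seq_cst); // SyncScope::SingleThread -> COMPILER_FENCE
  }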
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 4064a983099c..f06afdbcea9e 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -205,7 +205,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
for (auto T : {MVT::i8, MVT::i16, MVT::i32})
setOperationAction(ISD::SIGN_EXTEND_INREG, T, Action);
}
- for (auto T : MVT::integer_vector_valuetypes())
+ for (auto T : MVT::integer_fixedlen_vector_valuetypes())
setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
// Dynamic stack allocation: use the default expansion.
@@ -228,7 +228,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// - Floating-point extending loads.
// - Floating-point truncating stores.
// - i1 extending loads.
- // - extending/truncating SIMD loads/stores
+ // - truncating SIMD stores and most extending loads
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
for (auto T : MVT::integer_valuetypes())
@@ -237,7 +237,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
if (Subtarget->hasSIMD128()) {
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32,
MVT::v2f64}) {
- for (auto MemT : MVT::vector_valuetypes()) {
+ for (auto MemT : MVT::fixedlen_vector_valuetypes()) {
if (MVT(T) != MemT) {
setTruncStoreAction(T, MemT, Expand);
for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD})
@@ -245,6 +245,14 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
}
}
}
+ // But some vector extending loads are legal
+ if (Subtarget->hasUnimplementedSIMD128()) {
+ for (auto Ext : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
+ setLoadExtAction(Ext, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal);
+ }
+ }
}
// Don't do anything clever with build_pairs
@@ -259,16 +267,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setMaxAtomicSizeInBitsSupported(64);
- if (Subtarget->hasBulkMemory()) {
- // Use memory.copy and friends over multiple loads and stores
- MaxStoresPerMemcpy = 1;
- MaxStoresPerMemcpyOptSize = 1;
- MaxStoresPerMemmove = 1;
- MaxStoresPerMemmoveOptSize = 1;
- MaxStoresPerMemset = 1;
- MaxStoresPerMemsetOptSize = 1;
- }
-
// Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is
// consistent with the f64 and f128 names.
setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
@@ -337,8 +335,8 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL,
bool Float64, unsigned LoweredOpcode) {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- unsigned OutReg = MI.getOperand(0).getReg();
- unsigned InReg = MI.getOperand(1).getReg();
+ Register OutReg = MI.getOperand(0).getReg();
+ Register InReg = MI.getOperand(1).getReg();
unsigned Abs = Float64 ? WebAssembly::ABS_F64 : WebAssembly::ABS_F32;
unsigned FConst = Float64 ? WebAssembly::CONST_F64 : WebAssembly::CONST_F32;
@@ -396,9 +394,9 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL,
// For unsigned numbers, we have to do a separate comparison with zero.
if (IsUnsigned) {
Tmp1 = MRI.createVirtualRegister(MRI.getRegClass(InReg));
- unsigned SecondCmpReg =
+ Register SecondCmpReg =
MRI.createVirtualRegister(&WebAssembly::I32RegClass);
- unsigned AndReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ Register AndReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
BuildMI(BB, DL, TII.get(FConst), Tmp1)
.addFPImm(cast<ConstantFP>(ConstantFP::get(Ty, 0.0)));
BuildMI(BB, DL, TII.get(GE), SecondCmpReg).addReg(Tmp0).addReg(Tmp1);
@@ -550,6 +548,16 @@ bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT,
return true;
}
+bool WebAssemblyTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+ if (!Subtarget->hasUnimplementedSIMD128())
+ return false;
+ MVT ExtT = ExtVal.getSimpleValueType();
+ MVT MemT = cast<LoadSDNode>(ExtVal->getOperand(0))->getSimpleValueType(0);
+ return (ExtT == MVT::v8i16 && MemT == MVT::v8i8) ||
+ (ExtT == MVT::v4i32 && MemT == MVT::v4i16) ||
+ (ExtT == MVT::v2i64 && MemT == MVT::v2i32);
+}
+
EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &C,
EVT VT) const {
@@ -569,7 +577,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i32;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 4;
+ Info.align = Align(4);
// atomic.notify instruction does not really load the memory specified with
// this argument, but MachineMemOperand should either be load or store, so
// we set this to a load.
@@ -583,7 +591,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i32;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 4;
+ Info.align = Align(4);
Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
return true;
case Intrinsic::wasm_atomic_wait_i64:
@@ -591,7 +599,7 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = 8;
+ Info.align = Align(8);
Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
return true;
default:
@@ -623,7 +631,8 @@ static bool callingConvSupported(CallingConv::ID CallConv) {
CallConv == CallingConv::Cold ||
CallConv == CallingConv::PreserveMost ||
CallConv == CallingConv::PreserveAll ||
- CallConv == CallingConv::CXX_FAST_TLS;
+ CallConv == CallingConv::CXX_FAST_TLS ||
+ CallConv == CallingConv::WASM_EmscriptenInvoke;
}
SDValue
@@ -644,13 +653,36 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
if (CLI.IsPatchPoint)
fail(DL, DAG, "WebAssembly doesn't support patch point yet");
- // Fail if tail calls are required but not enabled
- if (!Subtarget->hasTailCall()) {
- if ((CallConv == CallingConv::Fast && CLI.IsTailCall &&
- MF.getTarget().Options.GuaranteedTailCallOpt) ||
- (CLI.CS && CLI.CS.isMustTailCall()))
- fail(DL, DAG, "WebAssembly 'tail-call' feature not enabled");
- CLI.IsTailCall = false;
+ if (CLI.IsTailCall) {
+ bool MustTail = CLI.CS && CLI.CS.isMustTailCall();
+ if (Subtarget->hasTailCall() && !CLI.IsVarArg) {
+ // Do not tail call unless caller and callee return types match
+ const Function &F = MF.getFunction();
+ const TargetMachine &TM = getTargetMachine();
+ Type *RetTy = F.getReturnType();
+ SmallVector<MVT, 4> CallerRetTys;
+ SmallVector<MVT, 4> CalleeRetTys;
+ computeLegalValueVTs(F, TM, RetTy, CallerRetTys);
+ computeLegalValueVTs(F, TM, CLI.RetTy, CalleeRetTys);
+ bool TypesMatch = CallerRetTys.size() == CalleeRetTys.size() &&
+ std::equal(CallerRetTys.begin(), CallerRetTys.end(),
+ CalleeRetTys.begin());
+ if (!TypesMatch) {
+ // musttail in this case would be an LLVM IR validation failure
+ assert(!MustTail);
+ CLI.IsTailCall = false;
+ }
+ } else {
+ CLI.IsTailCall = false;
+ if (MustTail) {
+ if (CLI.IsVarArg) {
+ // The return would pop the argument buffer
+ fail(DL, DAG, "WebAssembly does not support varargs tail calls");
+ } else {
+ fail(DL, DAG, "WebAssembly 'tail-call' feature not enabled");
+ }
+ }
+ }
}
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
@@ -659,6 +691,16 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+
+ // The generic code may have added an sret argument. If we're lowering an
+ // invoke function, the ABI requires that the function pointer be the first
+ // argument, so we may have to swap the arguments.
+ if (CallConv == CallingConv::WASM_EmscriptenInvoke && Outs.size() >= 2 &&
+ Outs[0].Flags.isSRet()) {
+ std::swap(Outs[0], Outs[1]);
+ std::swap(OutVals[0], OutVals[1]);
+ }
+
unsigned NumFixedArgs = 0;
for (unsigned I = 0; I < Outs.size(); ++I) {
const ISD::OutputArg &Out = Outs[I];
@@ -810,8 +852,8 @@ bool WebAssemblyTargetLowering::CanLowerReturn(
CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext & /*Context*/) const {
- // WebAssembly can't currently handle returning tuples.
- return Outs.size() <= 1;
+ // WebAssembly can only handle returning tuples with multivalue enabled
+ return Subtarget->hasMultivalue() || Outs.size() <= 1;
}
SDValue WebAssemblyTargetLowering::LowerReturn(
@@ -819,7 +861,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const {
- assert(Outs.size() <= 1 && "WebAssembly can only return up to one value");
+ assert((Subtarget->hasMultivalue() || Outs.size() <= 1) &&
+ "MVP WebAssembly can only return up to one value");
if (!callingConvSupported(CallConv))
fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
@@ -881,7 +924,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
// the buffer is passed as an argument.
if (IsVarArg) {
MVT PtrVT = getPointerTy(MF.getDataLayout());
- unsigned VarargVreg =
+ Register VarargVreg =
MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrVT));
MFI->setVarargBufferVreg(VarargVreg);
Chain = DAG.getCopyToReg(
@@ -1022,8 +1065,9 @@ SDValue WebAssemblyTargetLowering::LowerRETURNADDR(SDValue Op,
return SDValue();
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ MakeLibCallOptions CallOptions;
return makeLibCall(DAG, RTLIB::RETURN_ADDRESS, Op.getValueType(),
- {DAG.getConstant(Depth, DL, MVT::i32)}, false, DL)
+ {DAG.getConstant(Depth, DL, MVT::i32)}, CallOptions, DL)
.first;
}
@@ -1037,7 +1081,7 @@ SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op,
DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
- unsigned FP =
+ Register FP =
Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction());
return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FP, VT);
}
@@ -1249,68 +1293,116 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
const EVT VecT = Op.getValueType();
const EVT LaneT = Op.getOperand(0).getValueType();
const size_t Lanes = Op.getNumOperands();
+ bool CanSwizzle = Subtarget->hasUnimplementedSIMD128() && VecT == MVT::v16i8;
+
+ // BUILD_VECTORs are lowered to the instruction that initializes the highest
+ // possible number of lanes at once followed by a sequence of replace_lane
+ // instructions to individually initialize any remaining lanes.
+
+ // TODO: Tune this. For example, lanewise swizzling is very expensive, so
+ // swizzled lanes should be given greater weight.
+
+ // TODO: Investigate building vectors by shuffling together vectors built by
+ // separately specialized means.
+
auto IsConstant = [](const SDValue &V) {
return V.getOpcode() == ISD::Constant || V.getOpcode() == ISD::ConstantFP;
};
- // Find the most common operand, which is approximately the best to splat
- using Entry = std::pair<SDValue, size_t>;
- SmallVector<Entry, 16> ValueCounts;
- size_t NumConst = 0, NumDynamic = 0;
- for (const SDValue &Lane : Op->op_values()) {
- if (Lane.isUndef()) {
- continue;
- } else if (IsConstant(Lane)) {
- NumConst++;
- } else {
- NumDynamic++;
- }
- auto CountIt = std::find_if(ValueCounts.begin(), ValueCounts.end(),
- [&Lane](Entry A) { return A.first == Lane; });
- if (CountIt == ValueCounts.end()) {
- ValueCounts.emplace_back(Lane, 1);
+ // Returns the source vector and index vector pair if they exist. Checks for:
+ // (extract_vector_elt
+ // $src,
+ // (sign_extend_inreg (extract_vector_elt $indices, $i))
+ // )
+ auto GetSwizzleSrcs = [](size_t I, const SDValue &Lane) {
+ auto Bail = std::make_pair(SDValue(), SDValue());
+ if (Lane->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return Bail;
+ const SDValue &SwizzleSrc = Lane->getOperand(0);
+ const SDValue &IndexExt = Lane->getOperand(1);
+ if (IndexExt->getOpcode() != ISD::SIGN_EXTEND_INREG)
+ return Bail;
+ const SDValue &Index = IndexExt->getOperand(0);
+ if (Index->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return Bail;
+ const SDValue &SwizzleIndices = Index->getOperand(0);
+ if (SwizzleSrc.getValueType() != MVT::v16i8 ||
+ SwizzleIndices.getValueType() != MVT::v16i8 ||
+ Index->getOperand(1)->getOpcode() != ISD::Constant ||
+ Index->getConstantOperandVal(1) != I)
+ return Bail;
+ return std::make_pair(SwizzleSrc, SwizzleIndices);
+ };
+
+ using ValueEntry = std::pair<SDValue, size_t>;
+ SmallVector<ValueEntry, 16> SplatValueCounts;
+
+ using SwizzleEntry = std::pair<std::pair<SDValue, SDValue>, size_t>;
+ SmallVector<SwizzleEntry, 16> SwizzleCounts;
+
+ auto AddCount = [](auto &Counts, const auto &Val) {
+ auto CountIt = std::find_if(Counts.begin(), Counts.end(),
+ [&Val](auto E) { return E.first == Val; });
+ if (CountIt == Counts.end()) {
+ Counts.emplace_back(Val, 1);
} else {
CountIt->second++;
}
+ };
+
+ auto GetMostCommon = [](auto &Counts) {
+ auto CommonIt =
+ std::max_element(Counts.begin(), Counts.end(),
+ [](auto A, auto B) { return A.second < B.second; });
+ assert(CommonIt != Counts.end() && "Unexpected all-undef build_vector");
+ return *CommonIt;
+ };
+
+ size_t NumConstantLanes = 0;
+
+ // Count eligible lanes for each type of vector creation op
+ for (size_t I = 0; I < Lanes; ++I) {
+ const SDValue &Lane = Op->getOperand(I);
+ if (Lane.isUndef())
+ continue;
+
+ AddCount(SplatValueCounts, Lane);
+
+ if (IsConstant(Lane)) {
+ NumConstantLanes++;
+ } else if (CanSwizzle) {
+ auto SwizzleSrcs = GetSwizzleSrcs(I, Lane);
+ if (SwizzleSrcs.first)
+ AddCount(SwizzleCounts, SwizzleSrcs);
+ }
}
- auto CommonIt =
- std::max_element(ValueCounts.begin(), ValueCounts.end(),
- [](Entry A, Entry B) { return A.second < B.second; });
- assert(CommonIt != ValueCounts.end() && "Unexpected all-undef build_vector");
- SDValue SplatValue = CommonIt->first;
- size_t NumCommon = CommonIt->second;
-
- // If v128.const is available, consider using it instead of a splat
+
+ SDValue SplatValue;
+ size_t NumSplatLanes;
+ std::tie(SplatValue, NumSplatLanes) = GetMostCommon(SplatValueCounts);
+
+ SDValue SwizzleSrc;
+ SDValue SwizzleIndices;
+ size_t NumSwizzleLanes = 0;
+ if (SwizzleCounts.size())
+ std::forward_as_tuple(std::tie(SwizzleSrc, SwizzleIndices),
+ NumSwizzleLanes) = GetMostCommon(SwizzleCounts);
+
+ // Predicate returning true if the lane is properly initialized by the
+ // original instruction
+ std::function<bool(size_t, const SDValue &)> IsLaneConstructed;
+ SDValue Result;
if (Subtarget->hasUnimplementedSIMD128()) {
- // {i32,i64,f32,f64}.const opcode, and value
- const size_t ConstBytes = 1 + std::max(size_t(4), 16 / Lanes);
- // SIMD prefix and opcode
- const size_t SplatBytes = 2;
- const size_t SplatConstBytes = SplatBytes + ConstBytes;
- // SIMD prefix, opcode, and lane index
- const size_t ReplaceBytes = 3;
- const size_t ReplaceConstBytes = ReplaceBytes + ConstBytes;
- // SIMD prefix, v128.const opcode, and 128-bit value
- const size_t VecConstBytes = 18;
- // Initial v128.const and a replace_lane for each non-const operand
- const size_t ConstInitBytes = VecConstBytes + NumDynamic * ReplaceBytes;
- // Initial splat and all necessary replace_lanes
- const size_t SplatInitBytes =
- IsConstant(SplatValue)
- // Initial constant splat
- ? (SplatConstBytes +
- // Constant replace_lanes
- (NumConst - NumCommon) * ReplaceConstBytes +
- // Dynamic replace_lanes
- (NumDynamic * ReplaceBytes))
- // Initial dynamic splat
- : (SplatBytes +
- // Constant replace_lanes
- (NumConst * ReplaceConstBytes) +
- // Dynamic replace_lanes
- (NumDynamic - NumCommon) * ReplaceBytes);
- if (ConstInitBytes < SplatInitBytes) {
- // Create build_vector that will lower to initial v128.const
+ // Prefer swizzles over vector consts over splats
+ if (NumSwizzleLanes >= NumSplatLanes &&
+ NumSwizzleLanes >= NumConstantLanes) {
+ Result = DAG.getNode(WebAssemblyISD::SWIZZLE, DL, VecT, SwizzleSrc,
+ SwizzleIndices);
+ auto Swizzled = std::make_pair(SwizzleSrc, SwizzleIndices);
+ IsLaneConstructed = [&, Swizzled](size_t I, const SDValue &Lane) {
+ return Swizzled == GetSwizzleSrcs(I, Lane);
+ };
+ } else if (NumConstantLanes >= NumSplatLanes) {
SmallVector<SDValue, 16> ConstLanes;
for (const SDValue &Lane : Op->op_values()) {
if (IsConstant(Lane)) {
@@ -1321,26 +1413,35 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
}
}
- SDValue Result = DAG.getBuildVector(VecT, DL, ConstLanes);
- // Add replace_lane instructions for non-const lanes
- for (size_t I = 0; I < Lanes; ++I) {
- const SDValue &Lane = Op->getOperand(I);
- if (!Lane.isUndef() && !IsConstant(Lane))
- Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
- DAG.getConstant(I, DL, MVT::i32));
- }
- return Result;
+ Result = DAG.getBuildVector(VecT, DL, ConstLanes);
+ IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
+ return IsConstant(Lane);
+ };
+ }
+ }
+ if (!Result) {
+ // Use a splat, but possibly a load_splat
+ LoadSDNode *SplattedLoad;
+ if (Subtarget->hasUnimplementedSIMD128() &&
+ (SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) &&
+ SplattedLoad->getMemoryVT() == VecT.getVectorElementType()) {
+ Result = DAG.getNode(WebAssemblyISD::LOAD_SPLAT, DL, VecT, SplatValue);
+ } else {
+ Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
}
+ IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
+ return Lane == SplatValue;
+ };
}
- // Use a splat for the initial vector
- SDValue Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
- // Add replace_lane instructions for other values
+
+ // Add replace_lane instructions for any unhandled values
for (size_t I = 0; I < Lanes; ++I) {
const SDValue &Lane = Op->getOperand(I);
- if (Lane != SplatValue)
+ if (!Lane.isUndef() && !IsLaneConstructed(I, Lane))
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecT, Result, Lane,
DAG.getConstant(I, DL, MVT::i32));
}
+
return Result;
}
@@ -1415,11 +1516,6 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
// Only manually lower vector shifts
assert(Op.getSimpleValueType().isVector());
- // Expand all vector shifts until V8 fixes its implementation
- // TODO: remove this once V8 is fixed
- if (!Subtarget->hasUnimplementedSIMD128())
- return unrollVectorShift(Op, DAG);
-
// Unroll non-splat vector shifts
BuildVectorSDNode *ShiftVec;
SDValue SplatVal;
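To make the lane-counting heuristic in LowerBUILD_VECTOR concrete: when most lanes share one non-constant value, that value wins the splat count, the vector is initialized with a splat, and only the remaining lanes get replace_lane patches. A rough sketch using Clang's GCC-style vector extension (illustrative only; the exact instruction mix depends on which SIMD features are enabled):

  typedef int v4i32 __attribute__((vector_size(16)));

  v4i32 mostlySplat(int X) {
    // Three lanes share X, so the lowering above splats X and is then expected
    // to patch the constant lane with a single i32x4.replace_lane.
    v4i32 V = {X, X, 42, X};
    return V;
  }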
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index b3c7f3defd5f..a53e24a05542 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -63,7 +63,7 @@ private:
MachineMemOperand::Flags Flags,
bool *Fast) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
-
+ bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index e85aa57efc42..a9a99d38f9f1 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -71,12 +71,6 @@ class NotifyPatImmOff<PatFrag operand> :
def : NotifyPatImmOff<regPlusImm>;
def : NotifyPatImmOff<or_is_add>;
-def NotifyPatGlobalAddr :
- Pat<(i32 (int_wasm_atomic_notify (regPlusGA I32:$addr,
- (WebAssemblywrapper tglobaladdr:$off)),
- I32:$count)),
- (ATOMIC_NOTIFY 0, tglobaladdr:$off, I32:$addr, I32:$count)>;
-
// Select notifys with just a constant offset.
def NotifyPatOffsetOnly :
Pat<(i32 (int_wasm_atomic_notify imm:$off, I32:$count)),
@@ -105,13 +99,6 @@ def : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, or_is_add, ATOMIC_WAIT_I32>;
def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, regPlusImm, ATOMIC_WAIT_I64>;
def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, or_is_add, ATOMIC_WAIT_I64>;
-class WaitPatGlobalAddr<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
- ty:$exp, I64:$timeout)),
- (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, I64:$timeout)>;
-def : WaitPatGlobalAddr<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatGlobalAddr<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
-
// Select wait_i32, ATOMIC_WAIT_I32s with just a constant offset.
class WaitPatOffsetOnly<ValueType ty, Intrinsic kind, NI inst> :
Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)),
@@ -127,6 +114,19 @@ def : WaitPatGlobalAddrOffOnly<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
+// Atomic fences
+//===----------------------------------------------------------------------===//
+
+// A compiler fence instruction that prevents reordering of instructions.
+let Defs = [ARGUMENTS] in {
+let isPseudo = 1, hasSideEffects = 1 in
+defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">;
+let hasSideEffects = 1 in
+defm ATOMIC_FENCE : ATOMIC_NRI<(outs), (ins i8imm:$flags), [], "atomic.fence",
+ 0x03>;
+} // Defs = [ARGUMENTS]
+
+//===----------------------------------------------------------------------===//
// Atomic loads
//===----------------------------------------------------------------------===//
@@ -151,9 +151,6 @@ def : LoadPatImmOff<i64, atomic_load_64, regPlusImm, ATOMIC_LOAD_I64>;
def : LoadPatImmOff<i32, atomic_load_32, or_is_add, ATOMIC_LOAD_I32>;
def : LoadPatImmOff<i64, atomic_load_64, or_is_add, ATOMIC_LOAD_I64>;
-def : LoadPatGlobalAddr<i32, atomic_load_32, ATOMIC_LOAD_I32>;
-def : LoadPatGlobalAddr<i64, atomic_load_64, ATOMIC_LOAD_I64>;
-
// Select loads with just a constant offset.
def : LoadPatOffsetOnly<i32, atomic_load_32, ATOMIC_LOAD_I32>;
def : LoadPatOffsetOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
@@ -244,16 +241,6 @@ def : LoadPatImmOff<i64, sext_aload_8_64, or_is_add, ATOMIC_LOAD8_U_I64>;
def : LoadPatImmOff<i64, sext_aload_16_64, or_is_add, ATOMIC_LOAD16_U_I64>;
// No 32->64 patterns, just use i32.atomic.load and i64.extend_s/i64
-def : LoadPatGlobalAddr<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
-def : LoadPatGlobalAddr<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
-def : LoadPatGlobalAddr<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatGlobalAddr<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatGlobalAddr<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
-def : LoadPatGlobalAddr<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatGlobalAddr<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatGlobalAddr<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatGlobalAddr<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-
// Extending loads with just a constant offset
def : LoadPatOffsetOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
def : LoadPatOffsetOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
@@ -313,13 +300,6 @@ def : AStorePatImmOff<i64, atomic_store_64, regPlusImm, ATOMIC_STORE_I64>;
def : AStorePatImmOff<i32, atomic_store_32, or_is_add, ATOMIC_STORE_I32>;
def : AStorePatImmOff<i64, atomic_store_64, or_is_add, ATOMIC_STORE_I64>;
-class AStorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
- ty:$val),
- (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
-def : AStorePatGlobalAddr<i32, atomic_store_32, ATOMIC_STORE_I32>;
-def : AStorePatGlobalAddr<i64, atomic_store_64, ATOMIC_STORE_I64>;
-
// Select stores with just a constant offset.
class AStorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(kind imm:$off, ty:$val), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
@@ -374,12 +354,6 @@ def : AStorePatImmOff<i64, trunc_astore_8_64, or_is_add, ATOMIC_STORE8_I64>;
def : AStorePatImmOff<i64, trunc_astore_16_64, or_is_add, ATOMIC_STORE16_I64>;
def : AStorePatImmOff<i64, trunc_astore_32_64, or_is_add, ATOMIC_STORE32_I64>;
-def : AStorePatGlobalAddr<i32, atomic_store_8, ATOMIC_STORE8_I32>;
-def : AStorePatGlobalAddr<i32, atomic_store_16, ATOMIC_STORE16_I32>;
-def : AStorePatGlobalAddr<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
-def : AStorePatGlobalAddr<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
-def : AStorePatGlobalAddr<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
-
// Truncating stores with just a constant offset
def : AStorePatOffsetOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
def : AStorePatOffsetOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
@@ -500,11 +474,6 @@ class BinRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$val)),
(inst 0, imm:$off, I32:$addr, ty:$val)>;
-class BinRMWPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
- ty:$val)),
- (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>;
-
// Select binary RMWs with just a constant offset.
class BinRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind imm:$off, ty:$val)),
@@ -525,9 +494,6 @@ multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
def : BinRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>;
def : BinRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>;
- def : BinRMWPatGlobalAddr<i32, rmw_32, inst_32>;
- def : BinRMWPatGlobalAddr<i64, rmw_64, inst_64>;
-
def : BinRMWPatOffsetOnly<i32, rmw_32, inst_32>;
def : BinRMWPatOffsetOnly<i64, rmw_64, inst_64>;
@@ -622,17 +588,6 @@ multiclass BinRMWTruncExtPattern<
def : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
def : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
- def : BinRMWPatGlobalAddr<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatGlobalAddr<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
- def : BinRMWPatGlobalAddr<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
-
- def : BinRMWPatGlobalAddr<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatGlobalAddr<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatGlobalAddr<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
-
// Truncating-extending binary RMWs with just a constant offset
def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
@@ -732,11 +687,6 @@ class TerRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$exp, ty:$new)),
(inst 0, imm:$off, I32:$addr, ty:$exp, ty:$new)>;
-class TerRMWPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)),
- ty:$exp, ty:$new)),
- (inst 0, tglobaladdr:$off, I32:$addr, ty:$exp, ty:$new)>;
-
// Select ternary RMWs with just a constant offset.
class TerRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind imm:$off, ty:$exp, ty:$new)),
@@ -757,9 +707,6 @@ multiclass TerRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
def : TerRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>;
def : TerRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>;
- def : TerRMWPatGlobalAddr<i32, rmw_32, inst_32>;
- def : TerRMWPatGlobalAddr<i64, rmw_64, inst_64>;
-
def : TerRMWPatOffsetOnly<i32, rmw_32, inst_32>;
def : TerRMWPatOffsetOnly<i64, rmw_64, inst_64>;
@@ -846,17 +793,6 @@ multiclass TerRMWTruncExtPattern<
def : TerRMWPatImmOff<i64, sext_ter_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
def : TerRMWPatImmOff<i64, sext_ter_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
- def : TerRMWPatGlobalAddr<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatGlobalAddr<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatGlobalAddr<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatGlobalAddr<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>;
- def : TerRMWPatGlobalAddr<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>;
-
- def : TerRMWPatGlobalAddr<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatGlobalAddr<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatGlobalAddr<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatGlobalAddr<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
-
// Truncating-extending ternary RMWs with just a constant offset
def : TerRMWPatOffsetOnly<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
def : TerRMWPatOffsetOnly<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
@@ -887,13 +823,3 @@ defm : TerRMWTruncExtPattern<
ATOMIC_RMW8_U_CMPXCHG_I32, ATOMIC_RMW16_U_CMPXCHG_I32,
ATOMIC_RMW8_U_CMPXCHG_I64, ATOMIC_RMW16_U_CMPXCHG_I64,
ATOMIC_RMW32_U_CMPXCHG_I64>;
-
-//===----------------------------------------------------------------------===//
-// Atomic fences
-//===----------------------------------------------------------------------===//
-
-// A compiler fence instruction that prevents reordering of instructions.
-let Defs = [ARGUMENTS] in {
-let isPseudo = 1, hasSideEffects = 1 in
-defm COMPILER_FENCE : ATOMIC_NRI<(outs), (ins), [], "compiler_fence">;
-} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
index f4352e3d12ec..05735cf6d31f 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
@@ -39,7 +39,7 @@ defm MEMORY_INIT :
(ins i32imm_op:$seg, i32imm_op:$idx, I32:$dest,
I32:$offset, I32:$size),
(outs), (ins i32imm_op:$seg, i32imm_op:$idx),
- [(int_wasm_memory_init (i32 imm:$seg), (i32 imm:$idx), I32:$dest,
+ [(int_wasm_memory_init (i32 timm:$seg), (i32 timm:$idx), I32:$dest,
I32:$offset, I32:$size
)],
"memory.init\t$seg, $idx, $dest, $offset, $size",
@@ -48,7 +48,7 @@ defm MEMORY_INIT :
let hasSideEffects = 1 in
defm DATA_DROP :
BULK_I<(outs), (ins i32imm_op:$seg), (outs), (ins i32imm_op:$seg),
- [(int_wasm_data_drop (i32 imm:$seg))],
+ [(int_wasm_data_drop (i32 timm:$seg))],
"data.drop\t$seg", "data.drop\t$seg", 0x09>;
let mayLoad = 1, mayStore = 1 in
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 1870c5bc34b0..1afc9a8790dc 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -84,49 +84,19 @@ let isTerminator = 1, isBarrier = 1 in
defm END_FUNCTION : NRI<(outs), (ins), [], "end_function", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
-multiclass RETURN<WebAssemblyRegClass vt> {
- defm RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins),
- [(WebAssemblyreturn vt:$val)],
- "return \t$val", "return", 0x0f>;
- // Equivalent to RETURN_#vt, for use at the end of a function when wasm
- // semantics return by falling off the end of the block.
- let isCodeGenOnly = 1 in
- defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), (outs), (ins), []>;
-}
-
-multiclass SIMD_RETURN<ValueType vt> {
- defm RETURN_#vt : I<(outs), (ins V128:$val), (outs), (ins),
- [(WebAssemblyreturn (vt V128:$val))],
- "return \t$val", "return", 0x0f>,
- Requires<[HasSIMD128]>;
- // Equivalent to RETURN_#vt, for use at the end of a function when wasm
- // semantics return by falling off the end of the block.
- let isCodeGenOnly = 1 in
- defm FALLTHROUGH_RETURN_#vt : I<(outs), (ins V128:$val), (outs), (ins),
- []>,
- Requires<[HasSIMD128]>;
-}
let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
let isReturn = 1 in {
- defm "": RETURN<I32>;
- defm "": RETURN<I64>;
- defm "": RETURN<F32>;
- defm "": RETURN<F64>;
- defm "": RETURN<EXNREF>;
- defm "": SIMD_RETURN<v16i8>;
- defm "": SIMD_RETURN<v8i16>;
- defm "": SIMD_RETURN<v4i32>;
- defm "": SIMD_RETURN<v2i64>;
- defm "": SIMD_RETURN<v4f32>;
- defm "": SIMD_RETURN<v2f64>;
-
- defm RETURN_VOID : NRI<(outs), (ins), [(WebAssemblyreturn)], "return", 0x0f>;
-
- // This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt.
- let isCodeGenOnly = 1 in
- defm FALLTHROUGH_RETURN_VOID : NRI<(outs), (ins), []>;
+
+defm RETURN : I<(outs), (ins variable_ops), (outs), (ins),
+ [(WebAssemblyreturn)],
+ "return", "return", 0x0f>;
+// Equivalent to RETURN, for use at the end of a function when wasm
+// semantics return by falling off the end of the block.
+let isCodeGenOnly = 1 in
+defm FALLTHROUGH_RETURN : I<(outs), (ins variable_ops), (outs), (ins), []>;
+
} // isReturn = 1
defm UNREACHABLE : NRI<(outs), (ins), [(trap)], "unreachable", 0x00>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index 661fee2715ba..f3d9c5d5032c 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -171,6 +171,23 @@ defm I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
0xb1>;
} // hasSideEffects = 1
+def : Pat<(int_wasm_trunc_signed F32:$src),
+ (I32_TRUNC_S_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_unsigned F32:$src),
+ (I32_TRUNC_U_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_signed F64:$src),
+ (I32_TRUNC_S_F64 F64:$src)>;
+def : Pat<(int_wasm_trunc_unsigned F64:$src),
+ (I32_TRUNC_U_F64 F64:$src)>;
+def : Pat<(int_wasm_trunc_signed F32:$src),
+ (I64_TRUNC_S_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_unsigned F32:$src),
+ (I64_TRUNC_U_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_signed F64:$src),
+ (I64_TRUNC_S_F64 F64:$src)>;
+def : Pat<(int_wasm_trunc_unsigned F64:$src),
+ (I64_TRUNC_U_F64 F64:$src)>;
+
defm F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src), (outs), (ins),
[(set F32:$dst, (sint_to_fp I32:$src))],
"f32.convert_i32_s\t$dst, $src", "f32.convert_i32_s",
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index a86c9af28f0d..8e8126c90e72 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -38,7 +38,7 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
RI(STI.getTargetTriple()) {}
bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable(
- const MachineInstr &MI, AliasAnalysis *AA) const {
+ const MachineInstr &MI, AAResults *AA) const {
switch (MI.getOpcode()) {
case WebAssembly::CONST_I32:
case WebAssembly::CONST_I64:
@@ -60,7 +60,7 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// exist. However we need to handle both here.
auto &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC =
- TargetRegisterInfo::isVirtualRegister(DestReg)
+ Register::isVirtualRegister(DestReg)
? MRI.getRegClass(DestReg)
: MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg);
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index df1051b4f42c..fe6211663c31 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -43,7 +43,7 @@ public:
const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; }
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const override;
+ AAResults *AA) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 73ddbe85d551..044901481381 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -106,7 +106,8 @@ def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE",
def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT",
SDT_WebAssemblyArgument>;
def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN",
- SDT_WebAssemblyReturn, [SDNPHasChain]>;
+ SDT_WebAssemblyReturn,
+ [SDNPHasChain, SDNPVariadic]>;
def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper",
SDT_WebAssemblyWrapper>;
def WebAssemblywrapperPIC : SDNode<"WebAssemblyISD::WrapperPIC",
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 6916b165f970..eba9b80d3286 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -37,16 +37,6 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
return (~Known0.Zero & ~Known1.Zero) == 0;
}]>;
-// GlobalAddresses are conceptually unsigned values, so we can also fold them
-// into immediate values as long as the add is 'nuw'.
-// TODO: We'd like to also match GA offsets but there are cases where the
-// register can have a negative value. Find out what more we can do.
-def regPlusGA : PatFrag<(ops node:$addr, node:$off),
- (add node:$addr, node:$off),
- [{
- return N->getFlags().hasNoUnsignedWrap();
-}]>;
-
// We don't need a regPlusES because external symbols never have constant
// offsets folded into them, so we can just use add.
@@ -93,15 +83,6 @@ def : LoadPatImmOff<i64, load, or_is_add, LOAD_I64>;
def : LoadPatImmOff<f32, load, or_is_add, LOAD_F32>;
def : LoadPatImmOff<f64, load, or_is_add, LOAD_F64>;
-class LoadPatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off)))),
- (inst 0, tglobaladdr:$off, I32:$addr)>, Requires<[IsNotPIC]>;
-
-def : LoadPatGlobalAddr<i32, load, LOAD_I32>;
-def : LoadPatGlobalAddr<i64, load, LOAD_I64>;
-def : LoadPatGlobalAddr<f32, load, LOAD_F32>;
-def : LoadPatGlobalAddr<f64, load, LOAD_F64>;
-
// Select loads with just a constant offset.
class LoadPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(ty (kind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>;
@@ -167,18 +148,6 @@ def : LoadPatImmOff<i64, zextloadi16, or_is_add, LOAD16_U_I64>;
def : LoadPatImmOff<i64, sextloadi32, or_is_add, LOAD32_S_I64>;
def : LoadPatImmOff<i64, zextloadi32, or_is_add, LOAD32_U_I64>;
-def : LoadPatGlobalAddr<i32, sextloadi8, LOAD8_S_I32>;
-def : LoadPatGlobalAddr<i32, zextloadi8, LOAD8_U_I32>;
-def : LoadPatGlobalAddr<i32, sextloadi16, LOAD16_S_I32>;
-def : LoadPatGlobalAddr<i32, zextloadi8, LOAD16_U_I32>;
-
-def : LoadPatGlobalAddr<i64, sextloadi8, LOAD8_S_I64>;
-def : LoadPatGlobalAddr<i64, zextloadi8, LOAD8_U_I64>;
-def : LoadPatGlobalAddr<i64, sextloadi16, LOAD16_S_I64>;
-def : LoadPatGlobalAddr<i64, zextloadi16, LOAD16_U_I64>;
-def : LoadPatGlobalAddr<i64, sextloadi32, LOAD32_S_I64>;
-def : LoadPatGlobalAddr<i64, zextloadi32, LOAD32_U_I64>;
-
// Select extending loads with just a constant offset.
def : LoadPatOffsetOnly<i32, sextloadi8, LOAD8_S_I32>;
def : LoadPatOffsetOnly<i32, zextloadi8, LOAD8_U_I32>;
@@ -224,11 +193,6 @@ def : LoadPatImmOff<i32, extloadi16, or_is_add, LOAD16_U_I32>;
def : LoadPatImmOff<i64, extloadi8, or_is_add, LOAD8_U_I64>;
def : LoadPatImmOff<i64, extloadi16, or_is_add, LOAD16_U_I64>;
def : LoadPatImmOff<i64, extloadi32, or_is_add, LOAD32_U_I64>;
-def : LoadPatGlobalAddr<i32, extloadi8, LOAD8_U_I32>;
-def : LoadPatGlobalAddr<i32, extloadi16, LOAD16_U_I32>;
-def : LoadPatGlobalAddr<i64, extloadi8, LOAD8_U_I64>;
-def : LoadPatGlobalAddr<i64, extloadi16, LOAD16_U_I64>;
-def : LoadPatGlobalAddr<i64, extloadi32, LOAD32_U_I64>;
// Select "don't care" extending loads with just a constant offset.
def : LoadPatOffsetOnly<i32, extloadi8, LOAD8_U_I32>;
@@ -282,15 +246,6 @@ def : StorePatImmOff<i64, store, or_is_add, STORE_I64>;
def : StorePatImmOff<f32, store, or_is_add, STORE_F32>;
def : StorePatImmOff<f64, store, or_is_add, STORE_F64>;
-class StorePatGlobalAddr<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind ty:$val,
- (regPlusGA I32:$addr, (WebAssemblywrapper tglobaladdr:$off))),
- (inst 0, tglobaladdr:$off, I32:$addr, ty:$val)>, Requires<[IsNotPIC]>;
-def : StorePatGlobalAddr<i32, store, STORE_I32>;
-def : StorePatGlobalAddr<i64, store, STORE_I64>;
-def : StorePatGlobalAddr<f32, store, STORE_F32>;
-def : StorePatGlobalAddr<f64, store, STORE_F64>;
-
// Select stores with just a constant offset.
class StorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
Pat<(kind ty:$val, imm:$off), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
@@ -333,12 +288,6 @@ def : StorePatImmOff<i64, truncstorei8, or_is_add, STORE8_I64>;
def : StorePatImmOff<i64, truncstorei16, or_is_add, STORE16_I64>;
def : StorePatImmOff<i64, truncstorei32, or_is_add, STORE32_I64>;
-def : StorePatGlobalAddr<i32, truncstorei8, STORE8_I32>;
-def : StorePatGlobalAddr<i32, truncstorei16, STORE16_I32>;
-def : StorePatGlobalAddr<i64, truncstorei8, STORE8_I64>;
-def : StorePatGlobalAddr<i64, truncstorei16, STORE16_I64>;
-def : StorePatGlobalAddr<i64, truncstorei32, STORE32_I64>;
-
// Select truncating stores with just a constant offset.
def : StorePatOffsetOnly<i32, truncstorei8, STORE8_I32>;
def : StorePatOffsetOnly<i32, truncstorei16, STORE16_I32>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index dd8930f079b0..fc5d73dac52e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -40,47 +40,124 @@ def LaneIdx#SIZE : ImmLeaf<i32, "return 0 <= Imm && Imm < "#SIZE#";">;
//===----------------------------------------------------------------------===//
// Load: v128.load
-multiclass SIMDLoad<ValueType vec_t> {
- let mayLoad = 1, UseNamedOperandTable = 1 in
- defm LOAD_#vec_t :
+let mayLoad = 1, UseNamedOperandTable = 1 in
+defm LOAD_V128 :
+ SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ "v128.load\t$dst, ${off}(${addr})$p2align",
+ "v128.load\t$off$p2align", 0>;
+
+// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
+foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
+def : LoadPatNoOffset<vec_t, load, LOAD_V128>;
+def : LoadPatImmOff<vec_t, load, regPlusImm, LOAD_V128>;
+def : LoadPatImmOff<vec_t, load, or_is_add, LOAD_V128>;
+def : LoadPatOffsetOnly<vec_t, load, LOAD_V128>;
+def : LoadPatGlobalAddrOffOnly<vec_t, load, LOAD_V128>;
+}
+
+// vNxM.load_splat
+multiclass SIMDLoadSplat<string vec, bits<32> simdop> {
+ let mayLoad = 1, UseNamedOperandTable = 1,
+ Predicates = [HasUnimplementedSIMD128] in
+ defm LOAD_SPLAT_#vec :
SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
- "v128.load\t$dst, ${off}(${addr})$p2align",
- "v128.load\t$off$p2align", 0>;
+ vec#".load_splat\t$dst, ${off}(${addr})$p2align",
+ vec#".load_splat\t$off$p2align", simdop>;
}
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-defm "" : SIMDLoad<vec_t>;
-
-// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
-def : LoadPatNoOffset<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatImmOff<vec_t, load, regPlusImm, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatImmOff<vec_t, load, or_is_add, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatGlobalAddr<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatOffsetOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
-def : LoadPatGlobalAddrOffOnly<vec_t, load, !cast<NI>("LOAD_"#vec_t)>;
+defm "" : SIMDLoadSplat<"v8x16", 194>;
+defm "" : SIMDLoadSplat<"v16x8", 195>;
+defm "" : SIMDLoadSplat<"v32x4", 196>;
+defm "" : SIMDLoadSplat<"v64x2", 197>;
+
+def wasm_load_splat_t : SDTypeProfile<1, 1, []>;
+def wasm_load_splat : SDNode<"WebAssemblyISD::LOAD_SPLAT", wasm_load_splat_t>;
+
+foreach args = [["v16i8", "i32", "extloadi8"], ["v8i16", "i32", "extloadi16"],
+ ["v4i32", "i32", "load"], ["v2i64", "i64", "load"],
+ ["v4f32", "f32", "load"], ["v2f64", "f64", "load"]] in
+def load_splat_#args[0] :
+ PatFrag<(ops node:$addr), (wasm_load_splat
+ (!cast<ValueType>(args[1]) (!cast<PatFrag>(args[2]) node:$addr)))>;
+
+let Predicates = [HasUnimplementedSIMD128] in
+foreach args = [["v16i8", "v8x16"], ["v8i16", "v16x8"], ["v4i32", "v32x4"],
+ ["v2i64", "v64x2"], ["v4f32", "v32x4"], ["v2f64", "v64x2"]] in {
+def : LoadPatNoOffset<!cast<ValueType>(args[0]),
+ !cast<PatFrag>("load_splat_"#args[0]),
+ !cast<NI>("LOAD_SPLAT_"#args[1])>;
+def : LoadPatImmOff<!cast<ValueType>(args[0]),
+ !cast<PatFrag>("load_splat_"#args[0]),
+ regPlusImm,
+ !cast<NI>("LOAD_SPLAT_"#args[1])>;
+def : LoadPatImmOff<!cast<ValueType>(args[0]),
+ !cast<PatFrag>("load_splat_"#args[0]),
+ or_is_add,
+ !cast<NI>("LOAD_SPLAT_"#args[1])>;
+def : LoadPatOffsetOnly<!cast<ValueType>(args[0]),
+ !cast<PatFrag>("load_splat_"#args[0]),
+ !cast<NI>("LOAD_SPLAT_"#args[1])>;
+def : LoadPatGlobalAddrOffOnly<!cast<ValueType>(args[0]),
+ !cast<PatFrag>("load_splat_"#args[0]),
+ !cast<NI>("LOAD_SPLAT_"#args[1])>;
}
-// Store: v128.store
-multiclass SIMDStore<ValueType vec_t> {
- let mayStore = 1, UseNamedOperandTable = 1 in
- defm STORE_#vec_t :
- SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, V128:$vec),
+// Load and extend
+multiclass SIMDLoadExtend<ValueType vec_t, string name, bits<32> simdop> {
+ let mayLoad = 1, UseNamedOperandTable = 1,
+ Predicates = [HasUnimplementedSIMD128] in {
+ defm LOAD_EXTEND_S_#vec_t :
+ SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ name#"_s\t$dst, ${off}(${addr})$p2align",
+ name#"_s\t$off$p2align", simdop>;
+ defm LOAD_EXTEND_U_#vec_t :
+ SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
- "v128.store\t${off}(${addr})$p2align, $vec",
- "v128.store\t$off$p2align", 1>;
+ name#"_u\t$dst, ${off}(${addr})$p2align",
+ name#"_u\t$off$p2align", !add(simdop, 1)>;
+ }
}
-foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-defm "" : SIMDStore<vec_t>;
+defm "" : SIMDLoadExtend<v8i16, "i16x8.load8x8", 210>;
+defm "" : SIMDLoadExtend<v4i32, "i32x4.load16x4", 212>;
+defm "" : SIMDLoadExtend<v2i64, "i64x2.load32x2", 214>;
+
+let Predicates = [HasUnimplementedSIMD128] in
+foreach types = [[v8i16, i8], [v4i32, i16], [v2i64, i32]] in
+foreach exts = [["sextloadv", "_S"],
+ ["zextloadv", "_U"],
+ ["extloadv", "_U"]] in {
+def : LoadPatNoOffset<types[0], !cast<PatFrag>(exts[0]#types[1]),
+ !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
+def : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), regPlusImm,
+ !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
+def : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), or_is_add,
+ !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
+def : LoadPatOffsetOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
+ !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
+def : LoadPatGlobalAddrOffOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
+ !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
+}
+
+
+// Store: v128.store
+let mayStore = 1, UseNamedOperandTable = 1 in
+defm STORE_V128 :
+ SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, V128:$vec),
+ (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ "v128.store\t${off}(${addr})$p2align, $vec",
+ "v128.store\t$off$p2align", 1>;
+foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
-def : StorePatNoOffset<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatImmOff<vec_t, store, regPlusImm, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatImmOff<vec_t, store, or_is_add, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatGlobalAddr<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatOffsetOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
-def : StorePatGlobalAddrOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
+def : StorePatNoOffset<vec_t, store, STORE_V128>;
+def : StorePatImmOff<vec_t, store, regPlusImm, STORE_V128>;
+def : StorePatImmOff<vec_t, store, or_is_add, STORE_V128>;
+def : StorePatOffsetOnly<vec_t, store, STORE_V128>;
+def : StorePatGlobalAddrOffOnly<vec_t, store, STORE_V128>;
}
//===----------------------------------------------------------------------===//
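
LOAD_V128 and STORE_V128 above replace the per-lane-type load/store multiclasses with a single v128-wide instruction, and the new LOAD_SPLAT_* definitions add the vNxM.load_splat forms. A minimal sketch of what load_splat does, as an assumption based on the SIMD proposal rather than anything spelled out in this patch: one scalar is read from memory and broadcast into every lane of the 128-bit result.

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Sketch of vNxM.load_splat: load a single scalar of type T and replicate it
// across all lanes of a 128-bit vector (modeled here as std::array).
template <typename T, std::size_t Lanes>
std::array<T, Lanes> loadSplat(const void *Addr) {
  static_assert(sizeof(T) * Lanes == 16, "result must be 128 bits wide");
  T Scalar;
  std::memcpy(&Scalar, Addr, sizeof(T)); // single scalar memory access
  std::array<T, Lanes> Result;
  Result.fill(Scalar);                   // broadcast into every lane
  return Result;
}

// e.g. v32x4.load_splat behaves like loadSplat<uint32_t, 4>(ptr).
```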
@@ -90,7 +167,7 @@ def : StorePatGlobalAddrOffOnly<vec_t, store, !cast<NI>("STORE_"#vec_t)>;
// Constant: v128.const
multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> {
let isMoveImm = 1, isReMaterializable = 1,
- Predicates = [HasSIMD128, HasUnimplementedSIMD128] in
+ Predicates = [HasUnimplementedSIMD128] in
defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops,
[(set V128:$dst, (vec_t pat))],
"v128.const\t$dst, "#args,
@@ -198,6 +275,19 @@ def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
(i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF)))>;
}
+// Swizzle lanes: v8x16.swizzle
+def wasm_swizzle_t : SDTypeProfile<1, 2, []>;
+def wasm_swizzle : SDNode<"WebAssemblyISD::SWIZZLE", wasm_swizzle_t>;
+let Predicates = [HasUnimplementedSIMD128] in
+defm SWIZZLE :
+ SIMD_I<(outs V128:$dst), (ins V128:$src, V128:$mask), (outs), (ins),
+ [(set (v16i8 V128:$dst),
+ (wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)))],
+ "v8x16.swizzle\t$dst, $src, $mask", "v8x16.swizzle", 192>;
+
+def : Pat<(int_wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)),
+ (SWIZZLE V128:$src, V128:$mask)>;
+
// Create vector with identical lanes: splat
def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>;
def splat4 : PatFrag<(ops node:$x), (build_vector
@@ -286,7 +376,7 @@ multiclass ExtractLaneExtended<string sign, bits<32> baseInst> {
}
defm "" : ExtractLaneExtended<"_s", 5>;
-let Predicates = [HasSIMD128, HasUnimplementedSIMD128] in
+let Predicates = [HasUnimplementedSIMD128] in
defm "" : ExtractLaneExtended<"_u", 6>;
defm "" : ExtractLane<v4i32, "i32x4", LaneIdx4, I32, 13>;
defm "" : ExtractLane<v2i64, "i64x2", LaneIdx2, I64, 16>;
@@ -472,6 +562,11 @@ defm OR : SIMDBitwise<or, "or", 78>;
defm XOR : SIMDBitwise<xor, "xor", 79>;
} // isCommutable = 1
+// Bitwise logic: v128.andnot
+def andnot : PatFrag<(ops node:$left, node:$right), (and $left, (vnot $right))>;
+let Predicates = [HasUnimplementedSIMD128] in
+defm ANDNOT : SIMDBitwise<andnot, "andnot", 216>;
+
// Bitwise select: v128.bitselect
foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
defm BITSELECT_#vec_t :
@@ -655,7 +750,7 @@ defm ABS : SIMDUnaryFP<fabs, "abs", 149>;
defm NEG : SIMDUnaryFP<fneg, "neg", 150>;
// Square root: sqrt
-let Predicates = [HasSIMD128, HasUnimplementedSIMD128] in
+let Predicates = [HasUnimplementedSIMD128] in
defm SQRT : SIMDUnaryFP<fsqrt, "sqrt", 151>;
//===----------------------------------------------------------------------===//
@@ -679,7 +774,7 @@ let isCommutable = 1 in
defm MUL : SIMDBinaryFP<fmul, "mul", 156>;
// Division: div
-let Predicates = [HasSIMD128, HasUnimplementedSIMD128] in
+let Predicates = [HasUnimplementedSIMD128] in
defm DIV : SIMDBinaryFP<fdiv, "div", 157>;
// NaN-propagating minimum: min
@@ -712,6 +807,42 @@ defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_f32x4_u", 172>;
defm "" : SIMDConvert<v2i64, v2f64, fp_to_sint, "i64x2.trunc_sat_f64x2_s", 173>;
defm "" : SIMDConvert<v2i64, v2f64, fp_to_uint, "i64x2.trunc_sat_f64x2_u", 174>;
+// Widening operations
+multiclass SIMDWiden<ValueType vec_t, string vec, ValueType arg_t, string arg,
+ bits<32> baseInst> {
+ defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_low_signed,
+ vec#".widen_low_"#arg#"_s", baseInst>;
+ defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_high_signed,
+ vec#".widen_high_"#arg#"_s", !add(baseInst, 1)>;
+ defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_low_unsigned,
+ vec#".widen_low_"#arg#"_u", !add(baseInst, 2)>;
+ defm "" : SIMDConvert<vec_t, arg_t, int_wasm_widen_high_unsigned,
+ vec#".widen_high_"#arg#"_u", !add(baseInst, 3)>;
+}
+
+defm "" : SIMDWiden<v8i16, "i16x8", v16i8, "i8x16", 202>;
+defm "" : SIMDWiden<v4i32, "i32x4", v8i16, "i16x8", 206>;
+
+// Narrowing operations
+multiclass SIMDNarrow<ValueType vec_t, string vec, ValueType arg_t, string arg,
+ bits<32> baseInst> {
+ defm NARROW_S_#vec_t :
+ SIMD_I<(outs V128:$dst), (ins V128:$low, V128:$high), (outs), (ins),
+ [(set (vec_t V128:$dst), (vec_t (int_wasm_narrow_signed
+ (arg_t V128:$low), (arg_t V128:$high))))],
+ vec#".narrow_"#arg#"_s\t$dst, $low, $high", vec#".narrow_"#arg#"_s",
+ baseInst>;
+ defm NARROW_U_#vec_t :
+ SIMD_I<(outs V128:$dst), (ins V128:$low, V128:$high), (outs), (ins),
+ [(set (vec_t V128:$dst), (vec_t (int_wasm_narrow_unsigned
+ (arg_t V128:$low), (arg_t V128:$high))))],
+ vec#".narrow_"#arg#"_u\t$dst, $low, $high", vec#".narrow_"#arg#"_u",
+ !add(baseInst, 1)>;
+}
+
+defm "" : SIMDNarrow<v16i8, "i8x16", v8i16, "i16x8", 198>;
+defm "" : SIMDNarrow<v8i16, "i16x8", v4i32, "i32x4", 200>;
+
// Lower llvm.wasm.trunc.saturate.* to saturating instructions
def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
(fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>;
@@ -732,3 +863,25 @@ foreach t2 = !foldl(
)
) in
def : Pat<(t1 (bitconvert (t2 V128:$v))), (t1 V128:$v)>;
+
+//===----------------------------------------------------------------------===//
+// Quasi-Fused Multiply-Add and Subtract (QFMA/QFMS)
+//===----------------------------------------------------------------------===//
+
+multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> baseInst> {
+ defm QFMA_#vec_t :
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
+ (outs), (ins),
+ [(set (vec_t V128:$dst),
+ (int_wasm_qfma (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
+ vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", baseInst>;
+ defm QFMS_#vec_t :
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
+ (outs), (ins),
+ [(set (vec_t V128:$dst),
+ (int_wasm_qfms (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
+ vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", !add(baseInst, 1)>;
+}
+
+defm "" : SIMDQFM<v4f32, "f32x4", 0x98>;
+defm "" : SIMDQFM<v2f64, "f64x2", 0xa3>;
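
Two of the additions in this file are easy to mis-read from the TableGen alone, so here is a small self-contained model of their lane semantics. The swizzle behaviour (out-of-range indices yielding zero) is an assumption based on the SIMD proposal; the andnot behaviour follows directly from the (and $left, (vnot $right)) PatFrag above.

```cpp
#include <array>
#include <cstdint>

// v8x16.swizzle: pick byte lanes of Src using the indices in Mask; an index
// outside 0..15 produces 0 rather than trapping (assumption: SIMD proposal).
std::array<uint8_t, 16> swizzle(const std::array<uint8_t, 16> &Src,
                                const std::array<uint8_t, 16> &Mask) {
  std::array<uint8_t, 16> Result{};
  for (int I = 0; I < 16; ++I)
    Result[I] = Mask[I] < 16 ? Src[Mask[I]] : 0;
  return Result;
}

// v128.andnot: AND the first operand with the complement of the second,
// mirroring the andnot PatFrag; shown here on one 64-bit chunk of the vector.
uint64_t andnotChunk(uint64_t Left, uint64_t Right) { return Left & ~Right; }
```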
diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index e92b34430272..75d04252cbe9 100644
--- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-late-eh-prepare"
@@ -131,7 +132,7 @@ bool WebAssemblyLateEHPrepare::addCatches(MachineFunction &MF) {
auto InsertPos = MBB.begin();
if (InsertPos->isEHLabel()) // EH pad starts with an EH label
++InsertPos;
- unsigned DstReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
+ Register DstReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
BuildMI(MBB, InsertPos, MBB.begin()->getDebugLoc(),
TII.get(WebAssembly::CATCH), DstReg);
}
@@ -168,7 +169,7 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
if (CatchPos->isEHLabel()) // EH pad starts with an EH label
++CatchPos;
MachineInstr *Catch = &*CatchPos;
- unsigned ExnReg = Catch->getOperand(0).getReg();
+ Register ExnReg = Catch->getOperand(0).getReg();
BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW))
.addReg(ExnReg);
TI->eraseFromParent();
@@ -233,6 +234,7 @@ bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
// it. The pseudo instruction will be deleted later.
bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) {
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
auto *EHInfo = MF.getWasmEHFuncInfo();
SmallVector<MachineInstr *, 16> ExtractInstrs;
SmallVector<MachineInstr *, 8> ToDelete;
@@ -292,7 +294,7 @@ bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) {
// thenbb:
// %exn:i32 = extract_exception
// ... use exn ...
- unsigned ExnReg = Catch->getOperand(0).getReg();
+ Register ExnReg = Catch->getOperand(0).getReg();
auto *ThenMBB = MF.CreateMachineBasicBlock();
auto *ElseMBB = MF.CreateMachineBasicBlock();
MF.insert(std::next(MachineFunction::iterator(EHPad)), ElseMBB);
@@ -339,9 +341,11 @@ bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) {
WebAssembly::ClangCallTerminateFn);
assert(ClangCallTerminateFn &&
"There is no __clang_call_terminate() function");
+ Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ BuildMI(ElseMBB, DL, TII.get(WebAssembly::CONST_I32), Reg).addImm(0);
BuildMI(ElseMBB, DL, TII.get(WebAssembly::CALL_VOID))
.addGlobalAddress(ClangCallTerminateFn)
- .addImm(0);
+ .addReg(Reg);
BuildMI(ElseMBB, DL, TII.get(WebAssembly::UNREACHABLE));
} else {
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index 34a8195ac4b4..4314aa611549 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -68,7 +68,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
if (MI->getOpcode() != WebAssembly::BR_UNLESS)
continue;
- unsigned Cond = MI->getOperand(1).getReg();
+ Register Cond = MI->getOperand(1).getReg();
bool Inverted = false;
// Attempt to invert the condition in place.
@@ -188,7 +188,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
// If we weren't able to invert the condition in place. Insert an
// instruction to invert it.
if (!Inverted) {
- unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ Register Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp)
.addReg(Cond);
MFI.stackifyVReg(Tmp);
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 960d5134f6e9..1cf397dd060b 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -227,15 +227,6 @@ static cl::list<std::string>
namespace {
class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
- static const char *ResumeFName;
- static const char *EHTypeIDFName;
- static const char *EmLongjmpFName;
- static const char *EmLongjmpJmpbufFName;
- static const char *SaveSetjmpFName;
- static const char *TestSetjmpFName;
- static const char *FindMatchingCatchPrefix;
- static const char *InvokePrefix;
-
bool EnableEH; // Enable exception handling
bool EnableSjLj; // Enable setjmp/longjmp handling
@@ -274,6 +265,7 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
bool areAllExceptionsAllowed() const { return EHWhitelistSet.empty(); }
bool canLongjmp(Module &M, const Value *Callee) const;
+ bool isEmAsmCall(Module &M, const Value *Callee) const;
void rebuildSSA(Function &F);
@@ -292,19 +284,6 @@ public:
};
} // End anonymous namespace
-const char *WebAssemblyLowerEmscriptenEHSjLj::ResumeFName = "__resumeException";
-const char *WebAssemblyLowerEmscriptenEHSjLj::EHTypeIDFName =
- "llvm_eh_typeid_for";
-const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpFName =
- "emscripten_longjmp";
-const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpJmpbufFName =
- "emscripten_longjmp_jmpbuf";
-const char *WebAssemblyLowerEmscriptenEHSjLj::SaveSetjmpFName = "saveSetjmp";
-const char *WebAssemblyLowerEmscriptenEHSjLj::TestSetjmpFName = "testSetjmp";
-const char *WebAssemblyLowerEmscriptenEHSjLj::FindMatchingCatchPrefix =
- "__cxa_find_matching_catch_";
-const char *WebAssemblyLowerEmscriptenEHSjLj::InvokePrefix = "__invoke_";
-
char WebAssemblyLowerEmscriptenEHSjLj::ID = 0;
INITIALIZE_PASS(WebAssemblyLowerEmscriptenEHSjLj, DEBUG_TYPE,
"WebAssembly Lower Emscripten Exceptions / Setjmp / Longjmp",
@@ -335,7 +314,8 @@ static bool canThrow(const Value *V) {
static GlobalVariable *getGlobalVariableI32(Module &M, IRBuilder<> &IRB,
const char *Name) {
- auto* GV = dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, IRB.getInt32Ty()));
+ auto *GV =
+ dyn_cast<GlobalVariable>(M.getOrInsertGlobal(Name, IRB.getInt32Ty()));
if (!GV)
report_fatal_error(Twine("unable to create global: ") + Name);
@@ -376,9 +356,9 @@ WebAssemblyLowerEmscriptenEHSjLj::getFindMatchingCatch(Module &M,
PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
SmallVector<Type *, 16> Args(NumClauses, Int8PtrTy);
FunctionType *FTy = FunctionType::get(Int8PtrTy, Args, false);
- Function *F =
- Function::Create(FTy, GlobalValue::ExternalLinkage,
- FindMatchingCatchPrefix + Twine(NumClauses + 2), &M);
+ Function *F = Function::Create(
+ FTy, GlobalValue::ExternalLinkage,
+ "__cxa_find_matching_catch_" + Twine(NumClauses + 2), &M);
FindMatchingCatches[NumClauses] = F;
return F;
}
@@ -418,7 +398,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
Args.append(CI->arg_begin(), CI->arg_end());
CallInst *NewCall = IRB.CreateCall(getInvokeWrapper(CI), Args);
NewCall->takeName(CI);
- NewCall->setCallingConv(CI->getCallingConv());
+ NewCall->setCallingConv(CallingConv::WASM_EmscriptenInvoke);
NewCall->setDebugLoc(CI->getDebugLoc());
// Because we added the pointer to the callee as first argument, all
@@ -432,9 +412,22 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
for (unsigned I = 0, E = CI->getNumArgOperands(); I < E; ++I)
ArgAttributes.push_back(InvokeAL.getParamAttributes(I));
+ AttrBuilder FnAttrs(InvokeAL.getFnAttributes());
+ if (FnAttrs.contains(Attribute::AllocSize)) {
+    // The allocsize attribute (if any) refers to parameters by index and needs
+ // to be adjusted.
+ unsigned SizeArg;
+ Optional<unsigned> NEltArg;
+ std::tie(SizeArg, NEltArg) = FnAttrs.getAllocSizeArgs();
+ SizeArg += 1;
+ if (NEltArg.hasValue())
+ NEltArg = NEltArg.getValue() + 1;
+ FnAttrs.addAllocSizeAttr(SizeArg, NEltArg);
+ }
+
// Reconstruct the AttributesList based on the vector we constructed.
AttributeList NewCallAL =
- AttributeList::get(C, InvokeAL.getFnAttributes(),
+ AttributeList::get(C, AttributeSet::get(C, FnAttrs),
InvokeAL.getRetAttributes(), ArgAttributes);
NewCall->setAttributes(NewCallAL);
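
Because wrapInvoke prepends the callee pointer as a new first argument, an allocsize attribute that names parameters by position would otherwise point at the wrong argument on the __invoke_* wrapper. A minimal standalone sketch of the index adjustment (hypothetical helper, using std::optional in place of llvm::Optional):

```cpp
#include <optional>
#include <utility>

// Shift allocsize(N[, M]) parameter indices by one to account for a newly
// prepended first argument (the callee pointer).
std::pair<unsigned, std::optional<unsigned>>
shiftAllocSizeArgs(unsigned SizeArg, std::optional<unsigned> NumEltsArg) {
  SizeArg += 1;              // parameter i of the callee becomes parameter
  if (NumEltsArg)            // i+1 of the __invoke_* wrapper
    NumEltsArg = *NumEltsArg + 1;
  return {SizeArg, NumEltsArg};
}

// e.g. a callee carrying allocsize(0) ends up as allocsize(1) on the wrapper
// call, since its size argument moved from position 0 to position 1.
```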
@@ -473,8 +466,8 @@ Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallOrInvoke *CI) {
FunctionType *FTy = FunctionType::get(CalleeFTy->getReturnType(), ArgTys,
CalleeFTy->isVarArg());
- Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage,
- InvokePrefix + Sig, M);
+ Function *F =
+ Function::Create(FTy, GlobalValue::ExternalLinkage, "__invoke_" + Sig, M);
InvokeWrappers[Sig] = F;
return F;
}
@@ -491,39 +484,44 @@ bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M,
// and can't be passed by pointer. The result is a crash with illegal IR.
if (isa<InlineAsm>(Callee))
return false;
+ StringRef CalleeName = Callee->getName();
// The reason we include malloc/free here is to exclude the malloc/free
// calls generated in setjmp prep / cleanup routines.
- Function *SetjmpF = M.getFunction("setjmp");
- Function *MallocF = M.getFunction("malloc");
- Function *FreeF = M.getFunction("free");
- if (Callee == SetjmpF || Callee == MallocF || Callee == FreeF)
+ if (CalleeName == "setjmp" || CalleeName == "malloc" || CalleeName == "free")
return false;
// There are functions in JS glue code
- if (Callee == ResumeF || Callee == EHTypeIDF || Callee == SaveSetjmpF ||
- Callee == TestSetjmpF)
+ if (CalleeName == "__resumeException" || CalleeName == "llvm_eh_typeid_for" ||
+ CalleeName == "saveSetjmp" || CalleeName == "testSetjmp" ||
+ CalleeName == "getTempRet0" || CalleeName == "setTempRet0")
return false;
// __cxa_find_matching_catch_N functions cannot longjmp
- if (Callee->getName().startswith(FindMatchingCatchPrefix))
+ if (Callee->getName().startswith("__cxa_find_matching_catch_"))
return false;
// Exception-catching related functions
- Function *BeginCatchF = M.getFunction("__cxa_begin_catch");
- Function *EndCatchF = M.getFunction("__cxa_end_catch");
- Function *AllocExceptionF = M.getFunction("__cxa_allocate_exception");
- Function *ThrowF = M.getFunction("__cxa_throw");
- Function *TerminateF = M.getFunction("__clang_call_terminate");
- if (Callee == BeginCatchF || Callee == EndCatchF ||
- Callee == AllocExceptionF || Callee == ThrowF || Callee == TerminateF ||
- Callee == GetTempRet0Func || Callee == SetTempRet0Func)
+ if (CalleeName == "__cxa_begin_catch" || CalleeName == "__cxa_end_catch" ||
+ CalleeName == "__cxa_allocate_exception" || CalleeName == "__cxa_throw" ||
+ CalleeName == "__clang_call_terminate")
return false;
// Otherwise we don't know
return true;
}
+bool WebAssemblyLowerEmscriptenEHSjLj::isEmAsmCall(Module &M,
+ const Value *Callee) const {
+ StringRef CalleeName = Callee->getName();
+ // This is an exhaustive list from Emscripten's <emscripten/em_asm.h>.
+ return CalleeName == "emscripten_asm_const_int" ||
+ CalleeName == "emscripten_asm_const_double" ||
+ CalleeName == "emscripten_asm_const_int_sync_on_main_thread" ||
+ CalleeName == "emscripten_asm_const_double_sync_on_main_thread" ||
+ CalleeName == "emscripten_asm_const_async_on_main_thread";
+}
+
// Generate testSetjmp function call sequence with preamble and postamble.
// The code this generates is equivalent to the following JavaScript code:
// if (%__THREW__.val != 0 & threwValue != 0) {
@@ -605,15 +603,12 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
SSAUpdater SSA;
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
+ SSA.Initialize(I.getType(), I.getName());
+ SSA.AddAvailableValue(&BB, &I);
for (auto UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
Use &U = *UI;
++UI;
- SSA.Initialize(I.getType(), I.getName());
- SSA.AddAvailableValue(&BB, &I);
auto *User = cast<Instruction>(U.getUser());
- if (User->getParent() == &BB)
- continue;
-
if (auto *UserPN = dyn_cast<PHINode>(User))
if (UserPN->getIncomingBlock(U) == &BB)
continue;
@@ -660,13 +655,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
FunctionType *ResumeFTy =
FunctionType::get(IRB.getVoidTy(), IRB.getInt8PtrTy(), false);
ResumeF = Function::Create(ResumeFTy, GlobalValue::ExternalLinkage,
- ResumeFName, &M);
+ "__resumeException", &M);
// Register llvm_eh_typeid_for function
FunctionType *EHTypeIDTy =
FunctionType::get(IRB.getInt32Ty(), IRB.getInt8PtrTy(), false);
EHTypeIDF = Function::Create(EHTypeIDTy, GlobalValue::ExternalLinkage,
- EHTypeIDFName, &M);
+ "llvm_eh_typeid_for", &M);
for (Function &F : M) {
if (F.isDeclaration())
@@ -684,7 +679,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
// defined in JS code
EmLongjmpJmpbufF = Function::Create(LongjmpF->getFunctionType(),
GlobalValue::ExternalLinkage,
- EmLongjmpJmpbufFName, &M);
+ "emscripten_longjmp_jmpbuf", &M);
LongjmpF->replaceAllUsesWith(EmLongjmpJmpbufF);
}
@@ -697,19 +692,19 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
IRB.getInt32Ty()};
FunctionType *FTy =
FunctionType::get(Type::getInt32PtrTy(C), Params, false);
- SaveSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
- SaveSetjmpFName, &M);
+ SaveSetjmpF =
+ Function::Create(FTy, GlobalValue::ExternalLinkage, "saveSetjmp", &M);
// Register testSetjmp function
Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()};
FTy = FunctionType::get(IRB.getInt32Ty(), Params, false);
- TestSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
- TestSetjmpFName, &M);
+ TestSetjmpF =
+ Function::Create(FTy, GlobalValue::ExternalLinkage, "testSetjmp", &M);
FTy = FunctionType::get(IRB.getVoidTy(),
{IRB.getInt32Ty(), IRB.getInt32Ty()}, false);
EmLongjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
- EmLongjmpFName, &M);
+ "emscripten_longjmp", &M);
// Only traverse functions that use setjmp in order not to insert
// unnecessary prep / cleanup code in every function
@@ -970,10 +965,16 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
const Value *Callee = CI->getCalledValue();
if (!canLongjmp(M, Callee))
continue;
+ if (isEmAsmCall(M, Callee))
+ report_fatal_error("Cannot use EM_ASM* alongside setjmp/longjmp in " +
+ F.getName() +
+ ". Please consider using EM_JS, or move the "
+ "EM_ASM into another function.",
+ false);
Value *Threw = nullptr;
BasicBlock *Tail;
- if (Callee->getName().startswith(InvokePrefix)) {
+ if (Callee->getName().startswith("__invoke_")) {
// If invoke wrapper has already been generated for this call in
// previous EH phase, search for the load instruction
// %__THREW__.val = __THREW__;
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
index 494d3fadbc8c..750b2233e67a 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
@@ -94,7 +94,7 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
break; // Found a null terminator, skip the rest.
Constant *Associated = CS->getOperand(2);
- Associated = cast<Constant>(Associated->stripPointerCastsNoFollowAliases());
+ Associated = cast<Constant>(Associated->stripPointerCasts());
DtorFuncs[PriorityValue][Associated].push_back(DtorFunc);
}
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 288b991ae2c5..59c10243c545 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -79,7 +79,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
// Clang-provided symbols.
if (strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0 ||
strcmp(Name, "__memory_base") == 0 || strcmp(Name, "__table_base") == 0 ||
- strcmp(Name, "__tls_size") == 0) {
+ strcmp(Name, "__tls_size") == 0 || strcmp(Name, "__tls_align") == 0) {
bool Mutable =
strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0;
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
@@ -115,7 +115,7 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
getLibcallSignature(Subtarget, Name, Returns, Params);
}
auto Signature =
- make_unique<wasm::WasmSignature>(std::move(Returns), std::move(Params));
+ std::make_unique<wasm::WasmSignature>(std::move(Returns), std::move(Params));
WasmSym->setSignature(Signature.get());
Printer.addSignature(std::move(Signature));
@@ -163,6 +163,21 @@ MCOperand WebAssemblyMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
return MCOperand::createExpr(Expr);
}
+MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand(
+ SmallVector<wasm::ValType, 1> &&Returns,
+ SmallVector<wasm::ValType, 4> &&Params) const {
+ auto Signature = std::make_unique<wasm::WasmSignature>(std::move(Returns),
+ std::move(Params));
+ MCSymbol *Sym = Printer.createTempSymbol("typeindex");
+ auto *WasmSym = cast<MCSymbolWasm>(Sym);
+ WasmSym->setSignature(Signature.get());
+ Printer.addSignature(std::move(Signature));
+ WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx);
+ return MCOperand::createExpr(Expr);
+}
+
// Return the WebAssembly type associated with the given register class.
static wasm::ValType getType(const TargetRegisterClass *RC) {
if (RC == &WebAssembly::I32RegClass)
@@ -178,6 +193,16 @@ static wasm::ValType getType(const TargetRegisterClass *RC) {
llvm_unreachable("Unexpected register class");
}
+static void getFunctionReturns(const MachineInstr *MI,
+ SmallVectorImpl<wasm::ValType> &Returns) {
+ const Function &F = MI->getMF()->getFunction();
+ const TargetMachine &TM = MI->getMF()->getTarget();
+ Type *RetTy = F.getReturnType();
+ SmallVector<MVT, 4> CallerRetTys;
+ computeLegalValueVTs(F, TM, RetTy, CallerRetTys);
+ valTypesFromMVTs(CallerRetTys, Returns);
+}
+
void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
@@ -208,8 +233,6 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
if (I < Desc.NumOperands) {
const MCOperandInfo &Info = Desc.OpInfo[I];
if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
- MCSymbol *Sym = Printer.createTempSymbol("typeindex");
-
SmallVector<wasm::ValType, 4> Returns;
SmallVector<wasm::ValType, 4> Params;
@@ -226,17 +249,23 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
if (WebAssembly::isCallIndirect(MI->getOpcode()))
Params.pop_back();
- auto *WasmSym = cast<MCSymbolWasm>(Sym);
- auto Signature = make_unique<wasm::WasmSignature>(std::move(Returns),
- std::move(Params));
- WasmSym->setSignature(Signature.get());
- Printer.addSignature(std::move(Signature));
- WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+ // return_call_indirect instructions have the return type of the
+ // caller
+ if (MI->getOpcode() == WebAssembly::RET_CALL_INDIRECT)
+ getFunctionReturns(MI, Returns);
- const MCExpr *Expr = MCSymbolRefExpr::create(
- WasmSym, MCSymbolRefExpr::VK_WASM_TYPEINDEX, Ctx);
- MCOp = MCOperand::createExpr(Expr);
+ MCOp = lowerTypeIndexOperand(std::move(Returns), std::move(Params));
break;
+ } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
+ auto BT = static_cast<WebAssembly::BlockType>(MO.getImm());
+ assert(BT != WebAssembly::BlockType::Invalid);
+ if (BT == WebAssembly::BlockType::Multivalue) {
+ SmallVector<wasm::ValType, 1> Returns;
+ getFunctionReturns(MI, Returns);
+ MCOp = lowerTypeIndexOperand(std::move(Returns),
+ SmallVector<wasm::ValType, 4>());
+ break;
+ }
}
}
MCOp = MCOperand::createImm(MO.getImm());
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index 2c375a01a7f5..d79c54097eb7 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Compiler.h"
@@ -33,6 +34,8 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+ MCOperand lowerTypeIndexOperand(SmallVector<wasm::ValType, 1> &&,
+ SmallVector<wasm::ValType, 4> &&) const;
public:
WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer)
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index d31c1226bfdb..e4cc2389147b 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -49,10 +49,12 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, const Function &F,
computeLegalValueVTs(F, TM, Ty->getReturnType(), Results);
MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
- if (Results.size() > 1) {
- // WebAssembly currently can't lower returns of multiple values without
- // demoting to sret (see WebAssemblyTargetLowering::CanLowerReturn). So
- // replace multiple return values with a pointer parameter.
+ if (Results.size() > 1 &&
+ !TM.getSubtarget<WebAssemblySubtarget>(F).hasMultivalue()) {
+ // WebAssembly can't lower returns of multiple values without demoting to
+ // sret unless multivalue is enabled (see
+ // WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return
+    // values with a pointer parameter.
Results.clear();
Params.push_back(PtrVT);
}
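
The updated comment captures the new rule: multiple return values are only demoted to an sret-style pointer parameter when the multivalue feature is off. A tiny sketch of that decision, with a placeholder enum standing in for MVT:

```cpp
#include <vector>

// Placeholder value-type enum; llvm::MVT plays this role in the real code.
enum class Ty { I32, I64, F32, F64, Ptr };

// If the target cannot return multiple values directly (no multivalue),
// drop the results and add a pointer parameter for the caller to fill.
void shapeSignature(bool HasMultivalue, std::vector<Ty> &Results,
                    std::vector<Ty> &Params) {
  if (Results.size() > 1 && !HasMultivalue) {
    Results.clear();
    Params.push_back(Ty::Ptr); // hidden sret-style pointer parameter
  }
}
```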
@@ -72,7 +74,7 @@ void llvm::valTypesFromMVTs(const ArrayRef<MVT> &In,
std::unique_ptr<wasm::WasmSignature>
llvm::signatureFromMVTs(const SmallVectorImpl<MVT> &Results,
const SmallVectorImpl<MVT> &Params) {
- auto Sig = make_unique<wasm::WasmSignature>();
+ auto Sig = std::make_unique<wasm::WasmSignature>();
valTypesFromMVTs(Results, Sig->Returns);
valTypesFromMVTs(Params, Sig->Params);
return Sig;
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 4b9ba491dee6..16e2f4392984 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -96,13 +96,18 @@ public:
void stackifyVReg(unsigned VReg) {
assert(MF.getRegInfo().getUniqueVRegDef(VReg));
- auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ auto I = Register::virtReg2Index(VReg);
if (I >= VRegStackified.size())
VRegStackified.resize(I + 1);
VRegStackified.set(I);
}
+ void unstackifyVReg(unsigned VReg) {
+ auto I = Register::virtReg2Index(VReg);
+ if (I < VRegStackified.size())
+ VRegStackified.reset(I);
+ }
bool isVRegStackified(unsigned VReg) const {
- auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ auto I = Register::virtReg2Index(VReg);
if (I >= VRegStackified.size())
return false;
return VRegStackified.test(I);
@@ -111,12 +116,12 @@ public:
void initWARegs();
void setWAReg(unsigned VReg, unsigned WAReg) {
assert(WAReg != UnusedReg);
- auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ auto I = Register::virtReg2Index(VReg);
assert(I < WARegs.size());
WARegs[I] = WAReg;
}
unsigned getWAReg(unsigned VReg) const {
- auto I = TargetRegisterInfo::virtReg2Index(VReg);
+ auto I = Register::virtReg2Index(VReg);
assert(I < WARegs.size());
return WARegs[I];
}
diff --git a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
index 7ac0511c28b0..ac428fcc826a 100644
--- a/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
@@ -166,8 +166,8 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
if (!LibInfo.getLibFunc(Name, Func))
return false;
- unsigned FromReg = MI.getOperand(2).getReg();
- unsigned ToReg = MI.getOperand(0).getReg();
+ Register FromReg = MI.getOperand(2).getReg();
+ Register ToReg = MI.getOperand(0).getReg();
if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg))
report_fatal_error("Memory Intrinsic results: call to builtin function "
"with wrong signature, from/to mismatch");
@@ -184,7 +184,8 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) {
auto &MDT = getAnalysis<MachineDominatorTree>();
const WebAssemblyTargetLowering &TLI =
*MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
- const auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const auto &LibInfo =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(MF.getFunction());
auto &LIS = getAnalysis<LiveIntervals>();
bool Changed = false;
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
index 8c7c3305c201..0bd30791e57c 100644
--- a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -81,7 +81,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(
// Split multiple-VN LiveIntervals into multiple LiveIntervals.
SmallVector<LiveInterval *, 4> SplitLIs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned Reg = Register::index2VirtReg(I);
if (MRI.reg_nodbg_empty(Reg))
continue;
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index d20352259e07..9b60596e42b4 100644
--- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -64,11 +64,8 @@ void OptimizeReturned::visitCallSite(CallSite CS) {
if (isa<Constant>(Arg))
continue;
// Like replaceDominatedUsesWith but using Instruction/Use dominance.
- for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) {
- Use &U = *UI++;
- if (DT->dominates(Inst, U))
- U.set(Inst);
- }
+ Arg->replaceUsesWithIf(Inst,
+ [&](Use &U) { return DT->dominates(Inst, U); });
}
}
diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index e11cdeaa0e79..ea6cd09a604c 100644
--- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -63,7 +63,7 @@ static bool maybeRewriteToDrop(unsigned OldReg, unsigned NewReg,
bool Changed = false;
if (OldReg == NewReg) {
Changed = true;
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
MO.setReg(NewReg);
MO.setIsDead();
MFI.stackifyVReg(NewReg);
@@ -75,9 +75,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
const MachineFunction &MF,
WebAssemblyFunctionInfo &MFI,
MachineRegisterInfo &MRI,
- const WebAssemblyInstrInfo &TII,
- unsigned FallthroughOpc,
- unsigned CopyLocalOpc) {
+ const WebAssemblyInstrInfo &TII) {
if (DisableWebAssemblyFallthroughReturnOpt)
return false;
if (&MBB != &MF.back())
@@ -90,13 +88,36 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
if (&MI != &*End)
return false;
- if (FallthroughOpc != WebAssembly::FALLTHROUGH_RETURN_VOID) {
- // If the operand isn't stackified, insert a COPY to read the operand and
- // stackify it.
- MachineOperand &MO = MI.getOperand(0);
- unsigned Reg = MO.getReg();
+ for (auto &MO : MI.explicit_operands()) {
+ // If the operand isn't stackified, insert a COPY to read the operands and
+ // stackify them.
+ Register Reg = MO.getReg();
if (!MFI.isVRegStackified(Reg)) {
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ unsigned CopyLocalOpc;
+ const TargetRegisterClass *RegClass = MRI.getRegClass(Reg);
+ switch (RegClass->getID()) {
+ case WebAssembly::I32RegClassID:
+ CopyLocalOpc = WebAssembly::COPY_I32;
+ break;
+ case WebAssembly::I64RegClassID:
+ CopyLocalOpc = WebAssembly::COPY_I64;
+ break;
+ case WebAssembly::F32RegClassID:
+ CopyLocalOpc = WebAssembly::COPY_F32;
+ break;
+ case WebAssembly::F64RegClassID:
+ CopyLocalOpc = WebAssembly::COPY_F64;
+ break;
+ case WebAssembly::V128RegClassID:
+ CopyLocalOpc = WebAssembly::COPY_V128;
+ break;
+ case WebAssembly::EXNREFRegClassID:
+ CopyLocalOpc = WebAssembly::COPY_EXNREF;
+ break;
+ default:
+ llvm_unreachable("Unexpected register class for return operand");
+ }
+ Register NewReg = MRI.createVirtualRegister(RegClass);
BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
.addReg(Reg);
MO.setReg(NewReg);
@@ -104,8 +125,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
}
}
- // Rewrite the return.
- MI.setDesc(TII.get(FallthroughOpc));
+ MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN));
return true;
}
@@ -120,7 +140,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
const WebAssemblyTargetLowering &TLI =
*MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
- auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &LibInfo =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(MF.getFunction());
bool Changed = false;
for (auto &MBB : MF)
@@ -143,8 +164,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
report_fatal_error("Peephole: call to builtin function with "
"wrong signature, not consuming reg");
MachineOperand &MO = MI.getOperand(0);
- unsigned OldReg = MO.getReg();
- unsigned NewReg = Op2.getReg();
+ Register OldReg = MO.getReg();
+ Register NewReg = Op2.getReg();
if (MRI.getRegClass(NewReg) != MRI.getRegClass(OldReg))
report_fatal_error("Peephole: call to builtin function with "
@@ -156,60 +177,8 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
break;
}
// Optimize away an explicit void return at the end of the function.
- case WebAssembly::RETURN_I32:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I32,
- WebAssembly::COPY_I32);
- break;
- case WebAssembly::RETURN_I64:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I64,
- WebAssembly::COPY_I64);
- break;
- case WebAssembly::RETURN_F32:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F32,
- WebAssembly::COPY_F32);
- break;
- case WebAssembly::RETURN_F64:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64,
- WebAssembly::COPY_F64);
- break;
- case WebAssembly::RETURN_v16i8:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v16i8,
- WebAssembly::COPY_V128);
- break;
- case WebAssembly::RETURN_v8i16:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v8i16,
- WebAssembly::COPY_V128);
- break;
- case WebAssembly::RETURN_v4i32:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4i32,
- WebAssembly::COPY_V128);
- break;
- case WebAssembly::RETURN_v2i64:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2i64,
- WebAssembly::COPY_V128);
- break;
- case WebAssembly::RETURN_v4f32:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4f32,
- WebAssembly::COPY_V128);
- break;
- case WebAssembly::RETURN_v2f64:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v2f64,
- WebAssembly::COPY_V128);
- break;
- case WebAssembly::RETURN_VOID:
- Changed |= maybeRewriteToFallthrough(
- MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_VOID,
- WebAssembly::INSTRUCTION_LIST_END);
+ case WebAssembly::RETURN:
+ Changed |= maybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII);
break;
}
diff --git a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
index 3bfbf607344d..799b9388097c 100644
--- a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -95,7 +95,7 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(
// TODO: This is fairly heavy-handed; find a better approach.
//
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned Reg = Register::index2VirtReg(I);
// Skip unused registers.
if (MRI.use_nodbg_empty(Reg))
diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index 6f09c45b6642..043b6f1b7d18 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -98,7 +98,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "Interesting register intervals:\n");
for (unsigned I = 0; I < NumVRegs; ++I) {
- unsigned VReg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned VReg = Register::index2VirtReg(I);
if (MFI.isVRegStackified(VReg))
continue;
// Skip unused registers, which can use $drop.
@@ -157,9 +157,8 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
Changed |= Old != New;
UsedColors.set(Color);
Assignments[Color].push_back(LI);
- LLVM_DEBUG(
- dbgs() << "Assigning vreg" << TargetRegisterInfo::virtReg2Index(LI->reg)
- << " to vreg" << TargetRegisterInfo::virtReg2Index(New) << "\n");
+ LLVM_DEBUG(dbgs() << "Assigning vreg" << Register::virtReg2Index(LI->reg)
+ << " to vreg" << Register::virtReg2Index(New) << "\n");
}
if (!Changed)
return false;
diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index cdca23f55b29..72e7a7cf5042 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -89,7 +89,7 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
// Start the numbering for locals after the arg regs
unsigned CurReg = MFI.getParams().size();
for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) {
- unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx);
+ unsigned VReg = Register::index2VirtReg(VRegIdx);
// Skip unused registers.
if (MRI.use_empty(VReg))
continue;
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index a120a6471014..421d353a89e8 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -120,7 +120,7 @@ static void convertImplicitDefToConstZero(MachineInstr *MI,
Type::getDoubleTy(MF.getFunction().getContext())));
MI->addOperand(MachineOperand::CreateFPImm(Val));
} else if (RegClass == &WebAssembly::V128RegClass) {
- unsigned TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+ Register TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
MI->setDesc(TII->get(WebAssembly::SPLAT_v4i32));
MI->addOperand(MachineOperand::CreateReg(TempReg, false));
MachineInstr *Const = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
@@ -334,14 +334,14 @@ static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
for (const MachineOperand &MO : Def->operands()) {
if (!MO.isReg() || MO.isUndef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// If the register is dead here and at Insert, ignore it.
if (MO.isDead() && Insert->definesRegister(Reg) &&
!Insert->readsRegister(Reg))
continue;
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
// Ignore ARGUMENTS; it's just used to keep the ARGUMENT_* instructions
// from moving down, and we've already checked for that.
if (Reg == WebAssembly::ARGUMENTS)
@@ -436,8 +436,8 @@ static bool oneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
const MachineOperand &MO = UseInst->getOperand(0);
if (!MO.isReg())
return false;
- unsigned DefReg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DefReg) ||
+ Register DefReg = MO.getReg();
+ if (!Register::isVirtualRegister(DefReg) ||
!MFI.isVRegStackified(DefReg))
return false;
assert(MRI.hasOneNonDBGUse(DefReg));
@@ -499,7 +499,7 @@ static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op,
} else {
// The register may have unrelated uses or defs; create a new register for
// just our one def and use so that we can stackify it.
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
Def->getOperand(0).setReg(NewReg);
Op.setReg(NewReg);
@@ -535,7 +535,7 @@ static MachineInstr *rematerializeCheapDef(
WebAssemblyDebugValueManager DefDIs(&Def);
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
Op.setReg(NewReg);
MachineInstr *Clone = &*std::prev(Insert);
@@ -607,8 +607,8 @@ static MachineInstr *moveAndTeeForMultiUse(
// Create the Tee and attach the registers.
const auto *RegClass = MRI.getRegClass(Reg);
- unsigned TeeReg = MRI.createVirtualRegister(RegClass);
- unsigned DefReg = MRI.createVirtualRegister(RegClass);
+ Register TeeReg = MRI.createVirtualRegister(RegClass);
+ Register DefReg = MRI.createVirtualRegister(RegClass);
MachineOperand &DefMO = Def->getOperand(0);
MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(),
TII->get(getTeeOpcode(RegClass)), TeeReg)
@@ -807,11 +807,11 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
if (!Op.isReg())
continue;
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
assert(Op.isUse() && "explicit_uses() should only iterate over uses");
assert(!Op.isImplicit() &&
"explicit_uses() should only iterate over explicit operands");
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
continue;
// Identify the definition for this register at this point.
@@ -915,7 +915,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
for (MachineOperand &MO : reverse(MI.explicit_operands())) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (MFI.isVRegStackified(Reg)) {
if (MO.isDef())
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index ea9cfc00adfd..789a025794ea 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -91,8 +91,8 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
if (MI.getOpcode() == WebAssembly::ADD_I32) {
MachineOperand &OtherMO = MI.getOperand(3 - FIOperandNum);
if (OtherMO.isReg()) {
- unsigned OtherMOReg = OtherMO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(OtherMOReg)) {
+ Register OtherMOReg = OtherMO.getReg();
+ if (Register::isVirtualRegister(OtherMOReg)) {
MachineInstr *Def = MF.getRegInfo().getUniqueVRegDef(OtherMOReg);
// TODO: For now we just opportunistically do this in the case where
// the CONST_I32 happens to have exactly one def and one use. We
@@ -117,7 +117,7 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
// Create i32.add SP, offset and make it the operand.
const TargetRegisterClass *PtrRC =
MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
- unsigned OffsetOp = MRI.createVirtualRegister(PtrRC);
+ Register OffsetOp = MRI.createVirtualRegister(PtrRC);
BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32),
OffsetOp)
.addImm(FrameOffset);
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 7e65368e671a..bdf5fe2620a4 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -140,7 +140,7 @@ WebAssemblyTargetMachine::getSubtargetImpl(std::string CPU,
std::string FS) const {
auto &I = SubtargetMap[CPU + FS];
if (!I) {
- I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
+ I = std::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
}
return I.get();
}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 46ef765ce0f4..1c53e90daea7 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -25,10 +25,11 @@ WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
return TargetTransformInfo::PSK_FastHardware;
}
-unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
- unsigned Result = BaseT::getNumberOfRegisters(Vector);
+unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ unsigned Result = BaseT::getNumberOfRegisters(ClassID);
// For SIMD, use at least 16 registers, as a rough guess.
+ bool Vector = (ClassID == 1);
if (Vector)
Result = std::max(Result, 16u);
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 1b11b4b631eb..f0ecc73e91de 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -53,7 +53,7 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector);
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index e9d88d4818a5..a237da8154ab 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -32,9 +32,8 @@ bool WebAssembly::isChild(const MachineInstr &MI,
const MachineOperand &MO = MI.getOperand(0);
if (!MO.isReg() || MO.isImplicit() || !MO.isDef())
return false;
- unsigned Reg = MO.getReg();
- return TargetRegisterInfo::isVirtualRegister(Reg) &&
- MFI.isVRegStackified(Reg);
+ Register Reg = MO.getReg();
+ return Register::isVirtualRegister(Reg) && MFI.isVRegStackified(Reg);
}
bool WebAssembly::mayThrow(const MachineInstr &MI) {
@@ -51,7 +50,21 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) {
return false;
const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI.getOpcode()));
- assert(MO.isGlobal());
+ assert(MO.isGlobal() || MO.isSymbol());
+
+ if (MO.isSymbol()) {
+ // Some intrinsics are lowered to calls to external symbols, which are then
+ // lowered to calls to library functions. Most libcalls don't throw, but
+ // we only list some of them here for now.
+ // TODO: Consider adding 'nounwind' info in TargetLowering::CallLoweringInfo
+ // instead for more accurate info.
+ const char *Name = MO.getSymbolName();
+ if (strcmp(Name, "memcpy") == 0 || strcmp(Name, "memmove") == 0 ||
+ strcmp(Name, "memset") == 0)
+ return false;
+ return true;
+ }
+
const auto *F = dyn_cast<Function>(MO.getGlobal());
if (!F)
return true;
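
The strcmp chain in the new mayThrow check above could equally be written with llvm::StringSwitch; the sketch below restates the same three-name whitelist and is illustrative only (the helper name is invented, not part of the patch):

    // Sketch only: the same "known non-throwing libcalls" test as above,
    // expressed with llvm::StringSwitch. The helper name is invented.
    #include "llvm/ADT/StringSwitch.h"

    static bool libcallMayThrow(const char *Name) {
      return llvm::StringSwitch<bool>(Name)
          .Cases("memcpy", "memmove", "memset", false) // known not to throw
          .Default(true); // conservatively assume anything else may throw
    }
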
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 95cbf46d37ed..25be79ec2b1e 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -870,6 +870,14 @@ private:
bool parseDirectiveFPOEndProc(SMLoc L);
bool parseDirectiveFPOData(SMLoc L);
+ /// SEH directives.
+ bool parseSEHRegisterNumber(unsigned RegClassID, unsigned &RegNo);
+ bool parseDirectiveSEHPushReg(SMLoc);
+ bool parseDirectiveSEHSetFrame(SMLoc);
+ bool parseDirectiveSEHSaveReg(SMLoc);
+ bool parseDirectiveSEHSaveXMM(SMLoc);
+ bool parseDirectiveSEHPushFrame(SMLoc);
+
unsigned checkTargetMatchPredicate(MCInst &Inst) override;
bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
@@ -955,6 +963,8 @@ private:
public:
enum X86MatchResultTy {
Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "X86GenAsmMatcher.inc"
};
X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
@@ -3173,6 +3183,13 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
EmitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
+ case Match_InvalidImmUnsignedi4: {
+ SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return Error(ErrorLoc, "immediate must be an integer in range [0, 15]",
+ EmptyRange, MatchingInlineAsm);
+ }
case Match_MissingFeature:
return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm);
case Match_InvalidOperand:
@@ -3520,6 +3537,15 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
MatchingInlineAsm);
}
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_InvalidImmUnsignedi4) == 1) {
+ SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return Error(ErrorLoc, "immediate must be an integer in range [0, 15]",
+ EmptyRange, MatchingInlineAsm);
+ }
+
// If all of these were an outright failure, report it in a useless way.
return Error(IDLoc, "unknown instruction mnemonic", EmptyRange,
MatchingInlineAsm);
@@ -3572,6 +3598,16 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveFPOEndPrologue(DirectiveID.getLoc());
else if (IDVal == ".cv_fpo_endproc")
return parseDirectiveFPOEndProc(DirectiveID.getLoc());
+ else if (IDVal == ".seh_pushreg")
+ return parseDirectiveSEHPushReg(DirectiveID.getLoc());
+ else if (IDVal == ".seh_setframe")
+ return parseDirectiveSEHSetFrame(DirectiveID.getLoc());
+ else if (IDVal == ".seh_savereg")
+ return parseDirectiveSEHSaveReg(DirectiveID.getLoc());
+ else if (IDVal == ".seh_savexmm")
+ return parseDirectiveSEHSaveXMM(DirectiveID.getLoc());
+ else if (IDVal == ".seh_pushframe")
+ return parseDirectiveSEHPushFrame(DirectiveID.getLoc());
return true;
}
@@ -3708,6 +3744,140 @@ bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) {
return getTargetStreamer().emitFPOEndProc(L);
}
+bool X86AsmParser::parseSEHRegisterNumber(unsigned RegClassID,
+ unsigned &RegNo) {
+ SMLoc startLoc = getLexer().getLoc();
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+
+ // Try parsing the argument as a register first.
+ if (getLexer().getTok().isNot(AsmToken::Integer)) {
+ SMLoc endLoc;
+ if (ParseRegister(RegNo, startLoc, endLoc))
+ return true;
+
+ if (!X86MCRegisterClasses[RegClassID].contains(RegNo)) {
+ return Error(startLoc,
+ "register is not supported for use with this directive");
+ }
+ } else {
+ // Otherwise, an integer number matching the encoding of the desired
+ // register may appear.
+ int64_t EncodedReg;
+ if (getParser().parseAbsoluteExpression(EncodedReg))
+ return true;
+
+ // The SEH register number is the same as the encoding register number. Map
+ // from the encoding back to the LLVM register number.
+ RegNo = 0;
+ for (MCPhysReg Reg : X86MCRegisterClasses[RegClassID]) {
+ if (MRI->getEncodingValue(Reg) == EncodedReg) {
+ RegNo = Reg;
+ break;
+ }
+ }
+ if (RegNo == 0) {
+ return Error(startLoc,
+ "incorrect register number for use with this directive");
+ }
+ }
+
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHPushReg(SMLoc Loc) {
+ unsigned Reg = 0;
+ if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFIPushReg(Reg, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHSetFrame(SMLoc Loc) {
+ unsigned Reg = 0;
+ int64_t Off;
+ if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg))
+ return true;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("you must specify a stack pointer offset");
+
+ getParser().Lex();
+ if (getParser().parseAbsoluteExpression(Off))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFISetFrame(Reg, Off, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHSaveReg(SMLoc Loc) {
+ unsigned Reg = 0;
+ int64_t Off;
+ if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg))
+ return true;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("you must specify an offset on the stack");
+
+ getParser().Lex();
+ if (getParser().parseAbsoluteExpression(Off))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFISaveReg(Reg, Off, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHSaveXMM(SMLoc Loc) {
+ unsigned Reg = 0;
+ int64_t Off;
+ if (parseSEHRegisterNumber(X86::VR128XRegClassID, Reg))
+ return true;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("you must specify an offset on the stack");
+
+ getParser().Lex();
+ if (getParser().parseAbsoluteExpression(Off))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) {
+ bool Code = false;
+ StringRef CodeID;
+ if (getLexer().is(AsmToken::At)) {
+ SMLoc startLoc = getLexer().getLoc();
+ getParser().Lex();
+ if (!getParser().parseIdentifier(CodeID)) {
+ if (CodeID != "code")
+ return Error(startLoc, "expected @code");
+ Code = true;
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFIPushFrame(Code, Loc);
+ return false;
+}
+
// Force static initialization.
extern "C" void LLVMInitializeX86AsmParser() {
RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target());
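
For reference, the SEH directives wired up above take either a register name or its SEH encoding number, plus an offset where applicable. A minimal input they are meant to accept might look like the sketch below (illustrative only: the symbol, offsets, and ordering are invented, and .seh_proc/.seh_endproc are handled by the generic COFF directive parser rather than this file):

    // Hypothetical assembly exercising the new directives (values invented).
    static const char ExampleWin64Prologue[] =
        ".seh_proc my_func\n"          // assumed; parsed elsewhere
        "  .seh_pushreg %rbp\n"
        "  .seh_pushframe @code\n"
        "  .seh_setframe %rbp, 16\n"
        "  .seh_savereg %rsi, 24\n"
        "  .seh_savexmm %xmm6, 32\n"
        ".seh_endproc\n";              // assumed; parsed elsewhere
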
diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
index 5bc979d1f18c..e9be28ca77b0 100644
--- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h
+++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -35,6 +35,10 @@ inline bool isImmUnsignedi8Value(uint64_t Value) {
return isUInt<8>(Value) || isInt<8>(Value);
}
+inline bool isImmUnsignedi4Value(uint64_t Value) {
+ return isUInt<4>(Value);
+}
+
} // End of namespace llvm
#endif
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index a771ba366318..3a76d023e640 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -260,6 +260,15 @@ struct X86Operand final : public MCParsedAsmOperand {
return isImmSExti64i32Value(CE->getValue());
}
+ bool isImmUnsignedi4() const {
+ if (!isImm()) return false;
+ // If this isn't a constant expr, reject it. The immediate byte is shared
+ // with a register encoding. We can't have it affected by a relocation.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ return isImmUnsignedi4Value(CE->getValue());
+ }
+
bool isImmUnsignedi8() const {
if (!isImm()) return false;
// If this isn't a constant expr, just assume it fits and let relaxation
@@ -491,7 +500,7 @@ struct X86Operand final : public MCParsedAsmOperand {
void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- unsigned RegNo = getReg();
+ MCRegister RegNo = getReg();
if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
RegNo = getX86SubSuperRegister(RegNo, 32);
Inst.addOperand(MCOperand::createReg(RegNo));
@@ -572,7 +581,7 @@ struct X86Operand final : public MCParsedAsmOperand {
static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) {
SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
- auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Token, Loc, EndLoc);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
return Res;
@@ -582,7 +591,7 @@ struct X86Operand final : public MCParsedAsmOperand {
CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(),
StringRef SymName = StringRef(), void *OpDecl = nullptr) {
- auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Register, StartLoc, EndLoc);
Res->Reg.RegNo = RegNo;
Res->AddressOf = AddressOf;
Res->OffsetOfLoc = OffsetOfLoc;
@@ -593,19 +602,19 @@ struct X86Operand final : public MCParsedAsmOperand {
static std::unique_ptr<X86Operand>
CreateDXReg(SMLoc StartLoc, SMLoc EndLoc) {
- return llvm::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc);
+ return std::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc);
}
static std::unique_ptr<X86Operand>
CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) {
- auto Res = llvm::make_unique<X86Operand>(Prefix, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Prefix, StartLoc, EndLoc);
Res->Pref.Prefixes = Prefixes;
return Res;
}
static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
SMLoc StartLoc, SMLoc EndLoc) {
- auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
Res->Imm.Val = Val;
return Res;
}
@@ -615,7 +624,7 @@ struct X86Operand final : public MCParsedAsmOperand {
CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
unsigned Size = 0, StringRef SymName = StringRef(),
void *OpDecl = nullptr, unsigned FrontendSize = 0) {
- auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = 0;
@@ -643,7 +652,7 @@ struct X86Operand final : public MCParsedAsmOperand {
// The scale should always be one of {1,2,4,8}.
assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
"Invalid scale!");
- auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = SegReg;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = BaseReg;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index a241362a271d..e287f6625115 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -12,13 +12,14 @@
//
//===----------------------------------------------------------------------===//
+#include "X86DisassemblerDecoder.h"
+#include "llvm/ADT/StringRef.h"
+
#include <cstdarg> /* for va_*() */
#include <cstdio> /* for vsnprintf() */
#include <cstdlib> /* for exit() */
#include <cstring> /* for memset() */
-#include "X86DisassemblerDecoder.h"
-
using namespace llvm::X86Disassembler;
/// Specifies whether a ModR/M byte is needed and (if so) which
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 54413fa1a02f..f08fcb575bf0 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -287,7 +287,7 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const {
// Relax if the value is too big for a (signed) i8.
- return int64_t(Value) != int64_t(int8_t(Value));
+ return !isInt<8>(Value);
}
// FIXME: Can tblgen help at all here to verify there aren't other instructions
@@ -557,7 +557,7 @@ protected:
// If the frame pointer is other than esp/rsp, we do not have a way to
// generate a compact unwinding representation, so bail out.
- if (MRI.getLLVMRegNum(Inst.getRegister(), true) !=
+ if (*MRI.getLLVMRegNum(Inst.getRegister(), true) !=
(Is64Bit ? X86::RBP : X86::EBP))
return 0;
@@ -605,7 +605,7 @@ protected:
// unwind encoding.
return CU::UNWIND_MODE_DWARF;
- unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
SavedRegs[SavedRegIdx++] = Reg;
StackAdjust += OffsetSize;
InstrOffset += PushInstrSize(Reg);
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 232a06593238..bd009da60851 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -46,10 +46,10 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
-static X86_64RelType getType64(unsigned Kind,
+static X86_64RelType getType64(MCFixupKind Kind,
MCSymbolRefExpr::VariantKind &Modifier,
bool &IsPCRel) {
- switch (Kind) {
+ switch (unsigned(Kind)) {
default:
llvm_unreachable("Unimplemented");
case FK_NONE:
@@ -97,7 +97,7 @@ static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
MCSymbolRefExpr::VariantKind Modifier,
X86_64RelType Type, bool IsPCRel,
- unsigned Kind) {
+ MCFixupKind Kind) {
switch (Modifier) {
default:
llvm_unreachable("Unimplemented");
@@ -202,7 +202,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
// and we want to keep back-compatibility.
if (!Ctx.getAsmInfo()->canRelaxRelocations())
return ELF::R_X86_64_GOTPCREL;
- switch (Kind) {
+ switch (unsigned(Kind)) {
default:
return ELF::R_X86_64_GOTPCREL;
case X86::reloc_riprel_4byte_relax:
@@ -237,7 +237,7 @@ static X86_32RelType getType32(X86_64RelType T) {
static unsigned getRelocType32(MCContext &Ctx,
MCSymbolRefExpr::VariantKind Modifier,
X86_32RelType Type, bool IsPCRel,
- unsigned Kind) {
+ MCFixupKind Kind) {
switch (Modifier) {
default:
llvm_unreachable("Unimplemented");
@@ -265,8 +265,9 @@ static unsigned getRelocType32(MCContext &Ctx,
if (!Ctx.getAsmInfo()->canRelaxRelocations())
return ELF::R_386_GOT32;
- return Kind == X86::reloc_signed_4byte_relax ? ELF::R_386_GOT32X
- : ELF::R_386_GOT32;
+ return Kind == MCFixupKind(X86::reloc_signed_4byte_relax)
+ ? ELF::R_386_GOT32X
+ : ELF::R_386_GOT32;
case MCSymbolRefExpr::VK_GOTOFF:
assert(Type == RT32_32);
assert(!IsPCRel);
@@ -317,7 +318,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
- unsigned Kind = Fixup.getKind();
+ MCFixupKind Kind = Fixup.getKind();
X86_64RelType Type = getType64(Kind, Modifier, IsPCRel);
if (getEMachine() == ELF::EM_X86_64)
return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind);
@@ -329,5 +330,5 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
std::unique_ptr<MCObjectTargetWriter>
llvm::createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) {
- return llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
+ return std::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index e1125c176b25..d986c829d98e 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -163,5 +163,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
TextAlignFillValue = 0x90;
+ AllowAtInName = true;
+
UseIntegratedAssembler = true;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 31d26d08a63f..ac36bf3a12fa 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -862,6 +862,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
VEX_B = ~(BaseRegEnc >> 3) & 1;
unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
VEX_X = ~(IndexRegEnc >> 3) & 1;
+ if (!HasVEX_4V) // Only needed with VSIB, which doesn't use VVVV.
+ EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
+
break;
}
case X86II::MRMSrcReg: {
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index ce05ad974507..ced9eacc8b97 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -70,6 +70,10 @@ unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) {
return DWARFFlavour::X86_32_Generic;
}
+bool X86_MC::hasLockPrefix(const MCInst &MI) {
+ return MI.getFlags() & X86::IP_HAS_LOCK;
+}
+
void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
// FIXME: TableGen these.
for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
@@ -399,6 +403,9 @@ public:
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
uint64_t GotSectionVA,
const Triple &TargetTriple) const override;
+ Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst,
+ uint64_t Addr,
+ uint64_t Size) const override;
};
#define GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS
@@ -511,7 +518,31 @@ std::vector<std::pair<uint64_t, uint64_t>> X86MCInstrAnalysis::findPltEntries(
return findX86_64PltEntries(PltSectionVA, PltContents);
default:
return {};
- }
+ }
+}
+
+Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress(
+ const MCInst &Inst, uint64_t Addr, uint64_t Size) const {
+ const MCInstrDesc &MCID = Info->get(Inst.getOpcode());
+ int MemOpStart = X86II::getMemoryOperandNo(MCID.TSFlags);
+ if (MemOpStart == -1)
+ return None;
+ MemOpStart += X86II::getOperandBias(MCID);
+
+ const MCOperand &SegReg = Inst.getOperand(MemOpStart + X86::AddrSegmentReg);
+ const MCOperand &BaseReg = Inst.getOperand(MemOpStart + X86::AddrBaseReg);
+ const MCOperand &IndexReg = Inst.getOperand(MemOpStart + X86::AddrIndexReg);
+ const MCOperand &ScaleAmt = Inst.getOperand(MemOpStart + X86::AddrScaleAmt);
+ const MCOperand &Disp = Inst.getOperand(MemOpStart + X86::AddrDisp);
+ if (SegReg.getReg() != 0 || IndexReg.getReg() != 0 || ScaleAmt.getImm() != 1 ||
+ !Disp.isImm())
+ return None;
+
+ // RIP-relative addressing.
+ if (BaseReg.getReg() == X86::RIP)
+ return Addr + Size + Disp.getImm();
+
+ return None;
}
} // end of namespace X86_MC
@@ -567,13 +598,13 @@ extern "C" void LLVMInitializeX86TargetMC() {
createX86_64AsmBackend);
}
-unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
- bool High) {
+MCRegister llvm::getX86SubSuperRegisterOrZero(MCRegister Reg, unsigned Size,
+ bool High) {
switch (Size) {
- default: return 0;
+ default: return X86::NoRegister;
case 8:
if (High) {
- switch (Reg) {
+ switch (Reg.id()) {
default: return getX86SubSuperRegisterOrZero(Reg, 64);
case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
return X86::SI;
@@ -593,8 +624,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
return X86::BH;
}
} else {
- switch (Reg) {
- default: return 0;
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AL;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -630,8 +661,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
}
}
case 16:
- switch (Reg) {
- default: return 0;
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -666,8 +697,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
return X86::R15W;
}
case 32:
- switch (Reg) {
- default: return 0;
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::EAX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -702,7 +733,7 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
return X86::R15D;
}
case 64:
- switch (Reg) {
+ switch (Reg.id()) {
default: return 0;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::RAX;
@@ -740,9 +771,9 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
}
}
-unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) {
- unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High);
- assert(Res != 0 && "Unexpected register or VT");
+MCRegister llvm::getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High) {
+ MCRegister Res = getX86SubSuperRegisterOrZero(Reg, Size, High);
+ assert(Res != X86::NoRegister && "Unexpected register or VT");
return Res;
}
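
The RIP-relative branch of evaluateMemoryOperandAddress above resolves the operand to the address of the next instruction plus the displacement; a small sketch of that arithmetic, with made-up instruction address, size, and displacement, is:

    // Sketch only: how the RIP-relative case computes its target.
    // target = Addr (start of the instruction) + Size (its length) + Disp.
    #include <cstdint>

    constexpr uint64_t ripRelativeTarget(uint64_t Addr, uint64_t Size,
                                         int64_t Disp) {
      return Addr + Size + Disp;
    }

    // e.g. a 7-byte RIP-relative load at 0x401000 with displacement 0x2f0a:
    static_assert(ripRelativeTarget(0x401000, 7, 0x2f0a) == 0x403f11,
                  "target is measured from the end of the instruction");
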
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 00dd5908cbf5..0c789061f0e1 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+#include "llvm/MC/MCRegister.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/DataTypes.h"
#include <string>
@@ -57,6 +58,10 @@ unsigned getDwarfRegFlavour(const Triple &TT, bool isEH);
void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI);
+
+/// Returns true if this instruction has a LOCK prefix.
+bool hasLockPrefix(const MCInst &MI);
+
/// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc.
/// do not need to go through TargetRegistry.
MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU,
@@ -111,12 +116,12 @@ createX86WinCOFFObjectWriter(bool Is64Bit);
/// Returns the sub or super register of a specific X86 register.
/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
/// Aborts on error.
-unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false);
+MCRegister getX86SubSuperRegister(MCRegister, unsigned, bool High=false);
/// Returns the sub or super register of a specific X86 register.
/// Like getX86SubSuperRegister() but returns 0 on error.
-unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned,
- bool High = false);
+MCRegister getX86SubSuperRegisterOrZero(MCRegister, unsigned,
+ bool High = false);
} // End llvm namespace
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index fc7e99f61e5e..b67a7508fe72 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -276,7 +276,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
// x86_64 distinguishes movq foo@GOTPCREL so that the linker can
// rewrite the movq to an leaq at link time if the symbol ends up in
// the same linkage unit.
- if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
+ if (Fixup.getTargetKind() == X86::reloc_riprel_4byte_movq_load)
Type = MachO::X86_64_RELOC_GOT_LOAD;
else
Type = MachO::X86_64_RELOC_GOT;
@@ -339,8 +339,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
return;
} else {
Type = MachO::X86_64_RELOC_UNSIGNED;
- unsigned Kind = Fixup.getKind();
- if (Kind == X86::reloc_signed_4byte) {
+ if (Fixup.getTargetKind() == X86::reloc_signed_4byte) {
Asm.getContext().reportError(
Fixup.getLoc(),
"32-bit absolute addressing is not supported in 64-bit mode");
@@ -600,5 +599,5 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
std::unique_ptr<MCObjectTargetWriter>
llvm::createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
uint32_t CPUSubtype) {
- return llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
+ return std::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 3baab9da1c41..760239f76505 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -109,5 +109,5 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createX86WinCOFFObjectWriter(bool Is64Bit) {
- return llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
+ return std::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 796a27a17255..db624378d517 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -35,8 +35,9 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
MCStreamer::EmitWinEHHandlerData(Loc);
// We have to emit the unwind info now, because this directive
- // actually switches to the .xdata section!
- EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+ // actually switches to the .xdata section.
+ if (WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo())
+ EHStreamer.EmitUnwindInfo(*this, CurFrame);
}
void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
index e9987d1f62bd..d5494ef12370 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -170,7 +170,7 @@ bool X86WinCOFFTargetStreamer::emitFPOProc(const MCSymbol *ProcSym,
L, "opening new .cv_fpo_proc before closing previous frame");
return true;
}
- CurFPOData = llvm::make_unique<FPOData>();
+ CurFPOData = std::make_unique<FPOData>();
CurFPOData->Function = ProcSym;
CurFPOData->Begin = emitFPOLabel();
CurFPOData->ParamsSize = ParamsSize;
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index a95f68434d12..6840fc12751d 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -81,6 +81,12 @@ FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
+/// Return a pass that inserts int3 at the end of the function if it ends with a
+/// CALL instruction. The pass does the same for each funclet as well. This
+/// ensures that the open interval of function start and end PCs contains all
+/// return addresses for the benefit of the Windows x64 unwinder.
+FunctionPass *createX86AvoidTrailingCallPass();
+
/// Return a pass that optimizes the code-size of x86 call sequences. This is
/// done by replacing esp-relative movs with pushes.
FunctionPass *createX86CallFrameOptimization();
@@ -137,13 +143,13 @@ void initializeWinEHStatePassPass(PassRegistry &);
void initializeX86AvoidSFBPassPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
-void initializeX86ExpandPseudoPass(PassRegistry&);
void initializeX86CondBrFoldingPassPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
+void initializeX86ExpandPseudoPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
+void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
-
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 3112f00c91f2..d8631aca2734 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -95,7 +95,8 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
"Support 64-bit instructions">;
def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
- "64-bit with cmpxchg16b">;
+ "64-bit with cmpxchg16b",
+ [FeatureCMPXCHG8B]>;
def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
"SHLD instruction is slow">;
def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
@@ -240,8 +241,11 @@ def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
"Enable Cache Demote">;
def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
"Support ptwrite instruction">;
-def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
- "Support MPX instructions">;
+// FIXME: This feature is deprecated in 10.0 and should not be used for
+// anything, but removing it would break IR files that may contain it in a
+// target-feature attribute.
+def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false",
+ "Deprecated. Support MPX instructions">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
@@ -374,6 +378,10 @@ def FeatureHasFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast">;
+def FeaturePrefer128Bit
+ : SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true",
+ "Prefer 128-bit AVX instructions">;
+
def FeaturePrefer256Bit
: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
"Prefer 256-bit AVX instructions">;
@@ -449,6 +457,10 @@ def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"Merge branches to a three-way "
"conditional branch">;
+// Enable use of alias analysis during code generation.
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
// Bonnell
def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
// Silvermont
@@ -579,7 +591,6 @@ def ProcessorFeatures {
// Skylake
list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
- FeatureMPX,
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT,
@@ -594,6 +605,7 @@ def ProcessorFeatures {
// Skylake-AVX512
list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512,
+ FeaturePrefer256Bit,
FeatureCDI,
FeatureDQI,
FeatureBWI,
@@ -627,6 +639,7 @@ def ProcessorFeatures {
// Cannonlake
list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
+ FeaturePrefer256Bit,
FeatureCDI,
FeatureDQI,
FeatureBWI,
@@ -665,6 +678,17 @@ def ProcessorFeatures {
list<SubtargetFeature> ICXFeatures =
!listconcat(ICLInheritableFeatures, ICXSpecificFeatures);
+ // Tigerlake
+ list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureSHSTK];
+ list<SubtargetFeature> TGLSpecificFeatures = [FeatureHasFastGather];
+ list<SubtargetFeature> TGLInheritableFeatures =
+ !listconcat(TGLAdditionalFeatures, TGLSpecificFeatures);
+ list<SubtargetFeature> TGLFeatures =
+ !listconcat(ICLFeatures, TGLInheritableFeatures);
+
// Atom
list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
@@ -707,7 +731,6 @@ def ProcessorFeatures {
// Goldmont
list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
- FeatureMPX,
FeatureSHA,
FeatureRDSEED,
FeatureXSAVE,
@@ -786,6 +809,22 @@ def ProcessorFeatures {
list<SubtargetFeature> KNMFeatures =
!listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
+ // Barcelona
+ list<SubtargetFeature> BarcelonaInheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureSSE4A,
+ Feature3DNowA,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureCMPXCHG16B,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF,
+ FeatureCMOV,
+ Feature64Bit,
+ FeatureFastScalarShiftMasks];
+ list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures;
// Bobcat
list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87,
@@ -1093,6 +1132,8 @@ def : ProcessorModel<"icelake-client", SkylakeServerModel,
ProcessorFeatures.ICLFeatures>;
def : ProcessorModel<"icelake-server", SkylakeServerModel,
ProcessorFeatures.ICXFeatures>;
+def : ProcessorModel<"tigerlake", SkylakeServerModel,
+ ProcessorFeatures.TGLFeatures>;
// AMD CPUs.
@@ -1129,10 +1170,7 @@ foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
}
foreach P = ["amdfam10", "barcelona"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE4A, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV,
- Feature64Bit, FeatureFastScalarShiftMasks]>;
+ def : Proc<P, ProcessorFeatures.BarcelonaFeatures>;
}
// Bobcat
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 80120722e0e6..8d27be30a277 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -242,7 +242,7 @@ void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
return PrintOperand(MI, OpNo, O);
if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
O << '%';
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
(strcmp(Modifier+6,"32") == 0) ? 32 :
@@ -388,7 +388,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
bool EmitPercent = true;
if (!X86::GR8RegClass.contains(Reg) &&
@@ -575,7 +575,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
// Emitting note header.
int WordSize = TT.isArch64Bit() ? 8 : 4;
- EmitAlignment(WordSize == 4 ? 2 : 3);
+ EmitAlignment(WordSize == 4 ? Align(4) : Align(8));
OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0"
OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/);
@@ -585,7 +585,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4);
OutStreamer->EmitIntValue(4, 4); // data size
OutStreamer->EmitIntValue(FeatureFlagsAnd, 4); // data
- EmitAlignment(WordSize == 4 ? 2 : 3); // padding
+ EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding
OutStreamer->endSection(Nt);
OutStreamer->SwitchSection(Cur);
diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 3dcc1015dc7c..69c6b3356cbb 100644
--- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -35,6 +35,7 @@
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -390,7 +391,7 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
- unsigned Reg1 = MRI->createVirtualRegister(
+ Register Reg1 = MRI->createVirtualRegister(
TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
MachineInstr *NewLoad =
BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
diff --git a/lib/Target/X86/X86AvoidTrailingCall.cpp b/lib/Target/X86/X86AvoidTrailingCall.cpp
new file mode 100644
index 000000000000..fb4f9e2901dc
--- /dev/null
+++ b/lib/Target/X86/X86AvoidTrailingCall.cpp
@@ -0,0 +1,108 @@
+//===----- X86AvoidTrailingCall.cpp - Insert int3 after trailing calls ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The Windows x64 unwinder has trouble unwinding the stack when a return
+// address points to the end of the function. This pass maintains the invariant
+// that every return address is inside the bounds of its parent function or
+// funclet by inserting int3 if the last instruction would otherwise be a call.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+#define DEBUG_TYPE "x86-avoid-trailing-call"
+
+using namespace llvm;
+
+namespace {
+
+class X86AvoidTrailingCallPass : public MachineFunctionPass {
+public:
+ X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ StringRef getPassName() const override {
+ return "X86 avoid trailing call pass";
+ }
+ static char ID;
+};
+
+char X86AvoidTrailingCallPass::ID = 0;
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86AvoidTrailingCallPass() {
+ return new X86AvoidTrailingCallPass();
+}
+
+// A real instruction is a non-meta, non-pseudo instruction. Some pseudos
+// expand to nothing, and some expand to code. This logic conservatively assumes
+// they might expand to nothing.
+static bool isRealInstruction(MachineInstr &MI) {
+ return !MI.isPseudo() && !MI.isMetaInstruction();
+}
+
+// Return true if this is a call instruction, but not a tail call.
+static bool isCallInstruction(const MachineInstr &MI) {
+ return MI.isCall() && !MI.isReturn();
+}
+
+bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86InstrInfo &TII = *STI.getInstrInfo();
+ assert(STI.isTargetWin64() && "pass only runs on Win64");
+
+ // FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops
+ // before epilogues.
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ // Look for basic blocks that precede funclet entries or are at the end of
+ // the function.
+ MachineBasicBlock *NextMBB = MBB.getNextNode();
+ if (NextMBB && !NextMBB->isEHFuncletEntry())
+ continue;
+
+ // Find the last real instruction in this block, or previous blocks if this
+ // block is empty.
+ MachineBasicBlock::reverse_iterator LastRealInstr;
+ for (MachineBasicBlock &RMBB :
+ make_range(MBB.getReverseIterator(), MF.rend())) {
+ LastRealInstr = llvm::find_if(reverse(RMBB), isRealInstruction);
+ if (LastRealInstr != RMBB.rend())
+ break;
+ }
+
+ // Do nothing if this function or funclet has no instructions.
+ if (LastRealInstr == MF.begin()->rend())
+ continue;
+
+ // If this is a call instruction, insert int3 right after it with the same
+ // DebugLoc. Convert back to a forward iterator and advance the insertion
+ // position once.
+ if (isCallInstruction(*LastRealInstr)) {
+ LLVM_DEBUG({
+ dbgs() << "inserting int3 after trailing call instruction:\n";
+ LastRealInstr->dump();
+ dbgs() << '\n';
+ });
+
+ MachineBasicBlock::iterator MBBI = std::next(LastRealInstr.getReverse());
+ BuildMI(*LastRealInstr->getParent(), MBBI, LastRealInstr->getDebugLoc(),
+ TII.get(X86::INT3));
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
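
As a concrete picture of what the new pass guards against (the functions below are invented for illustration), a function whose last real instruction is a call can leave its return address pointing one past the function's end:

    // Illustrative only; 'report_fatal' is an assumed external helper.
    // Nothing follows the call in 'die', so the return address pushed by
    // the call would equal die's end PC and fall outside its unwind range;
    // the pass appends an int3 after such trailing calls.
    [[noreturn]] void report_fatal(const char *Msg);

    [[noreturn]] void die(const char *Msg) {
      report_fatal(Msg); // trailing call: last instruction in the function
    }
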
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 4df849a2e14c..ad7e32b4efc8 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -155,12 +155,22 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
// This is bad, and breaks SP adjustment.
// So, check that all of the frames in the function are closed inside
// the same block, and, for good measure, that there are no nested frames.
+ //
+ // If any call allocates more argument stack memory than the stack
+ // probe size, don't do this optimization. Otherwise, this pass
+ // would need to synthesize additional stack probe calls to allocate
+ // memory for arguments.
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+ bool UseStackProbe =
+ !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty();
+ unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF);
for (MachineBasicBlock &BB : MF) {
bool InsideFrameSequence = false;
for (MachineInstr &MI : BB) {
if (MI.getOpcode() == FrameSetupOpcode) {
+ if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe)
+ return false;
if (InsideFrameSequence)
return false;
InsideFrameSequence = true;
@@ -325,8 +335,8 @@ X86CallFrameOptimization::classifyInstruction(
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg())
continue;
- unsigned int Reg = MO.getReg();
- if (!RegInfo.isPhysicalRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
return Exit;
@@ -370,7 +380,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
while (I->getOpcode() == X86::LEA32r || I->isDebugInstr())
++I;
- unsigned StackPtr = RegInfo.getStackRegister();
+ Register StackPtr = RegInfo.getStackRegister();
auto StackPtrCopyInst = MBB.end();
// SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
// register. If it's there, use that virtual register as stack pointer
@@ -443,8 +453,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
for (const MachineOperand &MO : I->uses()) {
if (!MO.isReg())
continue;
- unsigned int Reg = MO.getReg();
- if (RegInfo.isPhysicalRegister(Reg))
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg))
UsedRegs.insert(Reg);
}
}
@@ -524,12 +534,12 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
break;
case X86::MOV32mr:
case X86::MOV64mr: {
- unsigned int Reg = PushOp.getReg();
+ Register Reg = PushOp.getReg();
// If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
// in preparation for the PUSH64. The upper 32 bits can be undef.
if (Is64Bit && Store->getOpcode() == X86::MOV32mr) {
- unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
@@ -598,7 +608,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
// movl %eax, (%esp)
// call
// Get rid of those with prejudice.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return nullptr;
// Make sure this is the only use of Reg.
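
The stack-probe guard added above can be pictured with a call whose outgoing argument area exceeds the default 4 KiB probe size (the types and names below are invented for illustration):

    // Illustrative only: on a 32-bit Windows target with the default
    // 4096-byte stack probe size, this call reserves more argument stack
    // than the probe size, so the mov->push transformation is skipped
    // instead of risking an unprobed allocation.
    struct BigArg { char Payload[8192]; };   // larger than the probe size
    void takesBigArg(BigArg ByValue);        // assumed to be stack-passed

    void caller() {
      BigArg A = {};
      takesBigArg(A);
    }
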
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index b16b3839c85a..7ee637cfd523 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -102,6 +102,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
DL(MIRBuilder.getMF().getDataLayout()),
STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {}
+ bool isIncomingArgumentHandler() const override { return false; }
+
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
@@ -155,8 +157,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info, CCState &State) override {
- bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State) override {
+ bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
StackSize = State.getNextStackOffset();
static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
@@ -229,7 +232,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
: ValueHandler(MIRBuilder, MRI, AssignFn),
DL(MIRBuilder.getMF().getDataLayout()) {}
- bool isArgumentHandler() const override { return true; }
+ bool isIncomingArgumentHandler() const override { return true; }
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -237,7 +240,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- unsigned AddrReg = MRI.createGenericVirtualRegister(
+ Register AddrReg = MRI.createGenericVirtualRegister(
LLT::pointer(0, DL.getPointerSizeInBits(0)));
MIRBuilder.buildFrameIndex(AddrReg, FI);
return AddrReg;
@@ -301,6 +304,7 @@ struct FormalArgHandler : public IncomingValueHandler {
: IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
};
@@ -372,10 +376,7 @@ bool X86CallLowering::lowerFormalArguments(
}
bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
- CallingConv::ID CallConv,
- const MachineOperand &Callee,
- const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const {
+ CallLoweringInfo &Info) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -385,8 +386,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
auto TRI = STI.getRegisterInfo();
// Handle only Linux C, X86_64_SysV calling conventions for now.
- if (!STI.isTargetLinux() ||
- !(CallConv == CallingConv::C || CallConv == CallingConv::X86_64_SysV))
+ if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C ||
+ Info.CallConv == CallingConv::X86_64_SysV))
return false;
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
@@ -395,18 +396,19 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
bool Is64Bit = STI.is64Bit();
- unsigned CallOpc = Callee.isReg()
+ unsigned CallOpc = Info.Callee.isReg()
? (Is64Bit ? X86::CALL64r : X86::CALL32r)
: (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
- auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc).add(Callee).addRegMask(
- TRI->getCallPreservedMask(MF, CallConv));
+ auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc)
+ .add(Info.Callee)
+ .addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv));
SmallVector<ArgInfo, 8> SplitArgs;
- for (const auto &OrigArg : OrigArgs) {
+ for (const auto &OrigArg : Info.OrigArgs) {
// TODO: handle not simple cases.
- if (OrigArg.Flags.isByVal())
+ if (OrigArg.Flags[0].isByVal())
return false;
if (OrigArg.Regs.size() > 1)
@@ -423,8 +425,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
- bool IsFixed = OrigArgs.empty() ? true : OrigArgs.back().IsFixed;
- if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(CallConv)) {
+ bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed;
+ if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
@@ -445,23 +447,24 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// If Callee is a reg, since it is used by a target specific
// instruction, it must have a register class matching the
// constraint of that instruction.
- if (Callee.isReg())
+ if (Info.Callee.isReg())
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+ 0));
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
- if (!OrigRet.Ty->isVoidTy()) {
- if (OrigRet.Regs.size() > 1)
+ if (!Info.OrigRet.Ty->isVoidTy()) {
+ if (Info.OrigRet.Regs.size() > 1)
return false;
SplitArgs.clear();
SmallVector<Register, 8> NewRegs;
- if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
+ if (!splitToValueTypes(Info.OrigRet, SplitArgs, DL, MRI,
[&](ArrayRef<Register> Regs) {
NewRegs.assign(Regs.begin(), Regs.end());
}))
@@ -472,7 +475,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
if (!NewRegs.empty())
- MIRBuilder.buildMerge(OrigRet.Regs[0], NewRegs);
+ MIRBuilder.buildMerge(Info.OrigRet.Regs[0], NewRegs);
}
CallSeqStart.addImm(Handler.getStackSize())
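
The varargs rule referenced in the truncated ABI comment above can be seen in an ordinary variadic call (illustrative, not taken from the patch): before such a call the caller must load the number of vector registers used into %al.

    // Illustrative only: printf is variadic, and the double travels in
    // %xmm0, so the SysV x86-64 caller sets %al to 1 before the call.
    #include <cstdio>

    void logValue(double V) {
      std::printf("value = %f\n", V); // caller emits "movb $1, %al" first
    }
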
diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h
index 0445331bc3ff..444a0c7d0122 100644
--- a/lib/Target/X86/X86CallLowering.h
+++ b/lib/Target/X86/X86CallLowering.h
@@ -34,9 +34,8 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
- const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
private:
/// A function of this type is used to perform value split action.
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 1c3034a5116a..4c49d68bec99 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -433,6 +433,7 @@ defm X86_SysV64_RegCall :
def RetCC_X86_32 : CallingConv<[
// If FastCC, use RetCC_X86_32_Fast.
CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>,
// If HiPE, use RetCC_X86_32_HiPE.
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
@@ -1000,6 +1001,7 @@ def CC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index a61fa3246f09..5123853f5455 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -436,8 +436,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
// Checks for "isUse()" as "uses()" returns also implicit definitions.
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned Reg = MO.getReg();
- auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)];
+ Register Reg = MO.getReg();
+ auto &RDM = RegDefMaps[Register::isVirtualRegister(Reg)];
if (MachineInstr *DefMI = RDM.lookup(Reg)) {
OperandToDefMap[&MO] = DefMI;
DepthInfo Info = DepthMap.lookup(DefMI);
@@ -456,8 +456,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
for (auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
- RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI;
+ Register Reg = MO.getReg();
+ RegDefMaps[Register::isVirtualRegister(Reg)][Reg] = &MI;
}
unsigned Latency = TSchedModel.computeInstrLatency(&MI);
@@ -710,7 +710,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// Skip any CMOVs in this group which don't load from memory.
if (!MI.mayLoad()) {
// Remember the false-side register input.
- unsigned FalseReg =
+ Register FalseReg =
MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
// Walk back through any intermediate cmovs referenced.
while (true) {
@@ -753,7 +753,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// Get a fresh register to use as the destination of the MOV.
const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
- unsigned TmpReg = MRI->createVirtualRegister(RC);
+ Register TmpReg = MRI->createVirtualRegister(RC);
SmallVector<MachineInstr *, 4> NewMIs;
bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
@@ -810,9 +810,9 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
- unsigned DestReg = MIIt->getOperand(0).getReg();
- unsigned Op1Reg = MIIt->getOperand(1).getReg();
- unsigned Op2Reg = MIIt->getOperand(2).getReg();
+ Register DestReg = MIIt->getOperand(0).getReg();
+ Register Op1Reg = MIIt->getOperand(1).getReg();
+ Register Op2Reg = MIIt->getOperand(2).getReg();
// If this CMOV we are processing is the opposite condition from the jump we
// generated, then we have to swap the operands for the PHI that is going to
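
The edits in this file are part of the mechanical migration from bare unsigned register numbers to the Register wrapper, whose static helpers replace the old TargetRegisterInfo statics. A simplified, self-contained model of the distinction being queried (the real class lives in llvm/CodeGen/Register.h; the top-bit tagging below is an assumption of the sketch, not a documented contract):

    #include <cassert>
    #include <cstdint>

    // Simplified stand-in for llvm::Register: virtual registers are tagged in
    // the upper part of the 32-bit space, physical registers start near zero.
    struct Register {
      static constexpr uint32_t VirtualBit = 1u << 31;
      static bool isVirtualRegister(uint32_t R) { return (R & VirtualBit) != 0; }
      static bool isPhysicalRegister(uint32_t R) {
        return R != 0 && (R & VirtualBit) == 0;
      }
      static uint32_t index2VirtReg(uint32_t Idx) { return Idx | VirtualBit; }
    };

    int main() {
      uint32_t V = Register::index2VirtReg(5); // sixth virtual register
      assert(Register::isVirtualRegister(V) && !Register::isPhysicalRegister(V));
      assert(Register::isPhysicalRegister(42)); // some physical register number
      return 0;
    }
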
diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp
index 9dea94f1368d..1bf2d5ba7b8f 100644
--- a/lib/Target/X86/X86CondBrFolding.cpp
+++ b/lib/Target/X86/X86CondBrFolding.cpp
@@ -564,7 +564,7 @@ X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) {
Modified = false;
break;
}
- return llvm::make_unique<TargetMBBInfo>(TargetMBBInfo{
+ return std::make_unique<TargetMBBInfo>(TargetMBBInfo{
TBB, FBB, BrInstr, CmpInstr, CC, SrcReg, CmpValue, Modified, CmpBrOnly});
}
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index 18bbfa32e11b..b4cf5cafbc6e 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -182,7 +182,7 @@ public:
MachineBasicBlock *MBB = MI->getParent();
auto &DL = MI->getDebugLoc();
- unsigned Reg = MRI->createVirtualRegister(
+ Register Reg = MRI->createVirtualRegister(
TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(),
*MBB->getParent()));
MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg);
@@ -219,13 +219,13 @@ public:
// Don't allow copies to/from GR8/GR16 physical registers.
// FIXME: Is there some better way to support this?
- unsigned DstReg = MI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ Register DstReg = MI->getOperand(0).getReg();
+ if (Register::isPhysicalRegister(DstReg) &&
(X86::GR8RegClass.contains(DstReg) ||
X86::GR16RegClass.contains(DstReg)))
return false;
- unsigned SrcReg = MI->getOperand(1).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ Register SrcReg = MI->getOperand(1).getReg();
+ if (Register::isPhysicalRegister(SrcReg) &&
(X86::GR8RegClass.contains(SrcReg) ||
X86::GR16RegClass.contains(SrcReg)))
return false;
@@ -241,7 +241,7 @@ public:
// Physical registers will not be converted. Assume that converting the
// COPY to the destination domain will eventually result in an actual
// instruction.
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ if (Register::isPhysicalRegister(MO.getReg()))
return 1;
RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()),
@@ -436,7 +436,7 @@ void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg,
if (EnclosedEdges.count(Reg))
return;
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return;
if (!MRI->hasOneDef(Reg))
@@ -593,8 +593,8 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
if (!DefOp.isReg())
continue;
- unsigned DefReg = DefOp.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DefReg)) {
+ Register DefReg = DefOp.getReg();
+ if (!Register::isVirtualRegister(DefReg)) {
C.setAllIllegal();
continue;
}
@@ -751,7 +751,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
// Go over all virtual registers and calculate a closure.
unsigned ClosureID = 0;
for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx);
+ unsigned Reg = Register::index2VirtReg(Idx);
// Only the GPR domain is currently supported as a source.
if (!isGPR(MRI->getRegClass(Reg)))
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 58680f1815bb..24c8e6d6f6eb 100755
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -131,7 +131,7 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
assert(!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31) &&
"ZMM instructions should not be in the EVEX->VEX tables");
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index b8624b40f2f7..9126a1fbea52 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -194,7 +194,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case X86::TCRETURNmi64: {
bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
MachineOperand &JumpTarget = MBBI->getOperand(0);
- MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
+ MachineOperand &StackAdjust = MBBI->getOperand(isMem ? X86::AddrNumOperands
+ : 1);
assert(StackAdjust.isImm() && "Expecting immediate value.");
// Adjust stack pointer.
@@ -259,7 +260,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
? X86::TAILJMPm
: (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
- for (unsigned i = 0; i != 5; ++i)
+ for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
MIB.add(MBBI->getOperand(i));
} else if (Opcode == X86::TCRETURNri64) {
JumpTarget.setIsKill();
@@ -274,7 +275,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineInstr &NewMI = *std::prev(MBBI);
NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
- MBB.getParent()->updateCallSiteInfo(&*MBBI, &NewMI);
+ MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
@@ -287,7 +288,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
assert(DestAddr.isReg() && "Offset should be in register!");
const bool Uses64BitFramePtr =
STI->isTarget64BitLP64() || STI->isTargetNaCl64();
- unsigned StackPtr = TRI->getStackRegister();
+ Register StackPtr = TRI->getStackRegister();
BuildMI(MBB, MBBI, DL,
TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr)
.addReg(DestAddr.getReg());
@@ -347,7 +348,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// actualcmpxchg Addr
// [E|R]BX = SaveRbx
const MachineOperand &InArg = MBBI->getOperand(6);
- unsigned SaveRbx = MBBI->getOperand(7).getReg();
+ Register SaveRbx = MBBI->getOperand(7).getReg();
unsigned ActualInArg =
Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
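
The TCRETURN hunks above replace the hard-coded operand count 5 with X86::AddrNumOperands when walking a memory reference. A short sketch of the five-operand addressing layout being iterated; the enum mirrors the X86 operand indices and the address function is illustrative:

    #include <cassert>
    #include <cstdint>

    // Illustrative mirror of the X86 memory-operand layout: every memory
    // reference is encoded as five consecutive MachineOperands.
    enum {
      AddrBaseReg = 0,    // base register
      AddrScaleAmt = 1,   // scale: 1, 2, 4 or 8
      AddrIndexReg = 2,   // index register
      AddrDisp = 3,       // displacement
      AddrSegmentReg = 4, // segment register
      AddrNumOperands = 5
    };

    // Effective address: base + index*scale + disp (segment ignored here).
    static uint64_t effectiveAddress(uint64_t Base, uint64_t Scale,
                                     uint64_t Index, int64_t Disp) {
      return Base + Index * Scale + Disp;
    }

    int main() {
      assert(AddrNumOperands == 5); // the value the removed literal hard-coded
      assert(effectiveAddress(0x1000, 4, 2, 8) == 0x1010);
      return 0;
    }
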
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 7b9ce0271205..e5e089d07d55 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1160,6 +1160,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
+ CC != CallingConv::Tail &&
CC != CallingConv::X86_FastCall &&
CC != CallingConv::X86_StdCall &&
CC != CallingConv::X86_ThisCall &&
@@ -1173,7 +1174,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
return false;
// Let SDISel handle vararg functions.
@@ -1241,7 +1243,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
}
// Make the copy.
- unsigned DstReg = VA.getLocReg();
+ Register DstReg = VA.getLocReg();
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
// Avoid a cross-class copy. This is very unlikely.
if (!SrcRC->contains(DstReg))
@@ -3157,7 +3159,7 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
if (Subtarget->getTargetTriple().isOSMSVCRT())
return 0;
if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
- CC == CallingConv::HiPE)
+ CC == CallingConv::HiPE || CC == CallingConv::Tail)
return 0;
if (CS)
@@ -3208,6 +3210,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
default: return false;
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Tail:
case CallingConv::WebKit_JS:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
@@ -3224,7 +3227,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
return false;
// Don't know how to handle Win64 varargs yet. Nothing special needed for
@@ -3387,6 +3391,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
case CCValAssign::SExtUpper:
case CCValAssign::ZExtUpper:
case CCValAssign::FPExt:
+ case CCValAssign::Trunc:
llvm_unreachable("Unexpected loc info!");
case CCValAssign::Indirect:
// FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
@@ -3547,7 +3552,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CCValAssign &VA = RVLocs[i];
EVT CopyVT = VA.getValVT();
unsigned CopyReg = ResultReg + i;
- unsigned SrcReg = VA.getLocReg();
+ Register SrcReg = VA.getLocReg();
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index bf541d933790..9f7c4afde760 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -80,7 +80,7 @@ class FixupBWInstPass : public MachineFunctionPass {
/// destination register of the MachineInstr passed in. It returns true if
/// that super register is dead just prior to \p OrigMI, and false if not.
bool getSuperRegDestIfDead(MachineInstr *OrigMI,
- unsigned &SuperDestReg) const;
+ Register &SuperDestReg) const;
/// Change the MachineInstr \p MI into the equivalent extending load to 32 bit
/// register if it is safe to do so. Return the replacement instruction if
@@ -92,6 +92,12 @@ class FixupBWInstPass : public MachineFunctionPass {
/// nullptr.
MachineInstr *tryReplaceCopy(MachineInstr *MI) const;
+ /// Change the MachineInstr \p MI into the equivalent extend to 32 bit
+ /// register if it is safe to do so. Return the replacement instruction if
+ /// OK, otherwise return nullptr.
+ MachineInstr *tryReplaceExtend(unsigned New32BitOpcode,
+ MachineInstr *MI) const;
+
// Change the MachineInstr \p MI into an equivalent 32 bit instruction if
// possible. Return the replacement instruction if OK, return nullptr
// otherwise.
@@ -169,10 +175,10 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
///
/// If so, return that super register in \p SuperDestReg.
bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
- unsigned &SuperDestReg) const {
+ Register &SuperDestReg) const {
auto *TRI = &TII->getRegisterInfo();
- unsigned OrigDestReg = OrigMI->getOperand(0).getReg();
+ Register OrigDestReg = OrigMI->getOperand(0).getReg();
SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32);
const auto SubRegIdx = TRI->getSubRegIndex(SuperDestReg, OrigDestReg);
@@ -232,12 +238,12 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
// %ax = KILL %ax, implicit killed %eax
// RET 0, %ax
unsigned Opc = OrigMI->getOpcode(); (void)Opc;
- // These are the opcodes currently handled by the pass, if something
- // else will be added we need to ensure that new opcode has the same
- // properties.
- assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr ||
- Opc == X86::MOV16rr) &&
- "Unexpected opcode.");
+ // These are the opcodes currently known to work with the code below. If
+ // something else is added, we need to ensure that the new opcode has the
+ // same properties.
+ if (Opc != X86::MOV8rm && Opc != X86::MOV16rm && Opc != X86::MOV8rr &&
+ Opc != X86::MOV16rr)
+ return false;
bool IsDefined = false;
for (auto &MO: OrigMI->implicit_operands()) {
@@ -247,7 +253,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg()))
- IsDefined = true;
+ IsDefined = true;
// If MO is a use of any part of the destination register but is not equal
// to OrigDestReg or one of its subregisters, we cannot use SuperDestReg.
@@ -268,7 +274,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
MachineInstr *MI) const {
- unsigned NewDestReg;
+ Register NewDestReg;
// We are going to try to rewrite this load to a larger zero-extending
// load. This is safe if all portions of the 32 bit super-register
@@ -295,11 +301,11 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
auto &OldDest = MI->getOperand(0);
auto &OldSrc = MI->getOperand(1);
- unsigned NewDestReg;
+ Register NewDestReg;
if (!getSuperRegDestIfDead(MI, NewDestReg))
return nullptr;
- unsigned NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32);
+ Register NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32);
// This is only correct if we access the same subregister index: otherwise,
// we could try to replace "movb %ah, %al" with "movl %eax, %eax".
@@ -326,6 +332,33 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
return MIB;
}
+MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode,
+ MachineInstr *MI) const {
+ Register NewDestReg;
+ if (!getSuperRegDestIfDead(MI, NewDestReg))
+ return nullptr;
+
+ // Don't interfere with formation of CBW instructions which should be a
+ // shorter encoding than even the MOVSX32rr8. It's also immune to partial
+ // merge issues on Intel CPUs.
+ if (MI->getOpcode() == X86::MOVSX16rr8 &&
+ MI->getOperand(0).getReg() == X86::AX &&
+ MI->getOperand(1).getReg() == X86::AL)
+ return nullptr;
+
+ // Safe to change the instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg);
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i)
+ MIB.add(MI->getOperand(i));
+
+ MIB.setMemRefs(MI->memoperands());
+
+ return MIB;
+}
+
MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
MachineBasicBlock &MBB) const {
// See if this is an instruction of the type we are currently looking for.
@@ -355,6 +388,15 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
// of the register.
return tryReplaceCopy(MI);
+ case X86::MOVSX16rr8:
+ return tryReplaceExtend(X86::MOVSX32rr8, MI);
+ case X86::MOVSX16rm8:
+ return tryReplaceExtend(X86::MOVSX32rm8, MI);
+ case X86::MOVZX16rr8:
+ return tryReplaceExtend(X86::MOVZX32rr8, MI);
+ case X86::MOVZX16rm8:
+ return tryReplaceExtend(X86::MOVZX32rm8, MI);
+
default:
// nothing to do here.
break;
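
The new tryReplaceExtend hook rewrites MOVSX16/MOVZX16 from 8-bit sources into their 32-bit counterparts once getSuperRegDestIfDead has proven the rest of the 32-bit register dead. The two forms differ only in what happens to the upper half of the destination, which is exactly the part that must be dead. A scalar model of that equivalence (illustrative only, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // movzx r16, r8: writes the low 16 bits, leaves bits 16..31 untouched.
    static uint32_t movzx16(uint32_t EAX, uint8_t Src) {
      return (EAX & 0xFFFF0000u) | Src;
    }

    // movzx r32, r8: writes the whole 32-bit register, zeroing bits 8..31.
    static uint32_t movzx32(uint8_t Src) { return Src; }

    int main() {
      uint32_t EAX = 0xDEADBEEF;
      uint8_t Src = 0x7F;
      // The two forms agree on the low 16 bits...
      assert((movzx16(EAX, Src) & 0xFFFFu) == (movzx32(Src) & 0xFFFFu));
      // ...but differ above bit 15, so the rewrite is only legal when the
      // 32-bit super register is dead, which is what the pass checks first.
      assert(movzx16(EAX, Src) != movzx32(Src));
      return 0;
    }
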
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 041529a0be68..543dc8b00fa0 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -67,8 +67,8 @@ class FixupLEAPass : public MachineFunctionPass {
/// - LEA that uses RIP relative addressing mode
/// - LEA that uses 16-bit addressing mode
/// This function currently handles the first 2 cases only.
- MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineBasicBlock &MBB);
+ void processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB, bool OptIncDec);
/// Look for LEAs that are really two address LEAs that we might be able to
/// turn into regular ADD instructions.
@@ -216,14 +216,10 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
continue;
- if (IsSlowLEA) {
+ if (IsSlowLEA)
processInstructionForSlowLEA(I, MBB);
- } else if (IsSlow3OpsLEA) {
- if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) {
- MBB.erase(I);
- I = NewMI;
- }
- }
+ else if (IsSlow3OpsLEA)
+ processInstrForSlow3OpLEA(I, MBB, OptIncDec);
}
// Second pass for creating LEAs. This may reverse some of the
@@ -301,18 +297,14 @@ static inline bool isInefficientLEAReg(unsigned Reg) {
Reg == X86::R13D || Reg == X86::R13;
}
-static inline bool isRegOperand(const MachineOperand &Op) {
- return Op.isReg() && Op.getReg() != X86::NoRegister;
-}
-
/// Returns true if this LEA uses base and index registers, and the base register
/// is known to be inefficient for the subtarget.
// TODO: use a variant scheduling class to model the latency profile
// of LEA instructions, and implement this logic as a scheduling predicate.
static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
const MachineOperand &Index) {
- return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
- isRegOperand(Index);
+ return Base.isReg() && isInefficientLEAReg(Base.getReg()) && Index.isReg() &&
+ Index.getReg() != X86::NoRegister;
}
static inline bool hasLEAOffset(const MachineOperand &Offset) {
@@ -372,9 +364,9 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
!TII->isSafeToClobberEFLAGS(MBB, I))
return false;
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned BaseReg = Base.getReg();
- unsigned IndexReg = Index.getReg();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register BaseReg = Base.getReg();
+ Register IndexReg = Index.getReg();
// Don't change stack adjustment LEAs.
if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP))
@@ -500,9 +492,9 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
if (Segment.getReg() != 0 || !Offset.isImm() ||
!TII->isSafeToClobberEFLAGS(MBB, I))
return;
- const unsigned DstR = Dst.getReg();
- const unsigned SrcR1 = Base.getReg();
- const unsigned SrcR2 = Index.getReg();
+ const Register DstR = Dst.getReg();
+ const Register SrcR1 = Base.getReg();
+ const Register SrcR2 = Index.getReg();
if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR))
return;
if (Scale.getImm() > 1)
@@ -534,111 +526,150 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
}
}
-MachineInstr *
-FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineBasicBlock &MBB) {
+void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB,
+ bool OptIncDec) {
+ MachineInstr &MI = *I;
const unsigned LEAOpcode = MI.getOpcode();
- const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Dest = MI.getOperand(0);
const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt);
const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg);
const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp);
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
- if (!(TII->isThreeOperandsLEA(MI) ||
- hasInefficientLEABaseReg(Base, Index)) ||
+ if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) ||
!TII->isSafeToClobberEFLAGS(MBB, MI) ||
Segment.getReg() != X86::NoRegister)
- return nullptr;
+ return;
+
+ Register DestReg = Dest.getReg();
+ Register BaseReg = Base.getReg();
+ Register IndexReg = Index.getReg();
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ if (BaseReg != 0)
+ BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit);
+ if (IndexReg != 0)
+ IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit);
+ }
- unsigned DstR = Dst.getReg();
- unsigned BaseR = Base.getReg();
- unsigned IndexR = Index.getReg();
- unsigned SSDstR =
- (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
bool IsScale1 = Scale.getImm() == 1;
- bool IsInefficientBase = isInefficientLEAReg(BaseR);
- bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+ bool IsInefficientBase = isInefficientLEAReg(BaseReg);
+ bool IsInefficientIndex = isInefficientLEAReg(IndexReg);
// Skip these cases since it takes more than 2 instructions
// to replace the LEA instruction.
- if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
- return nullptr;
- if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
- (IsInefficientIndex || !IsScale1))
- return nullptr;
-
- const DebugLoc DL = MI.getDebugLoc();
- const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
- const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+ if (IsInefficientBase && DestReg == BaseReg && !IsScale1)
+ return;
LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+ MachineInstr *NewMI = nullptr;
+
// First try to replace LEA with one or two (for the 3-op LEA case)
// add instructions:
// 1.lea (%base,%index,1), %base => add %index,%base
// 2.lea (%base,%index,1), %index => add %base,%index
- if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
- const MachineOperand &Src = DstR == BaseR ? Index : Base;
- MachineInstr *NewMI =
- BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
- LLVM_DEBUG(NewMI->dump(););
- // Create ADD instruction for the Offset in case of 3-Ops LEA.
- if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- LLVM_DEBUG(NewMI->dump(););
+ if (IsScale1 && (DestReg == BaseReg || DestReg == IndexReg)) {
+ unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+ if (DestReg != BaseReg)
+ std::swap(BaseReg, IndexReg);
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(BaseReg)
+ .addReg(IndexReg)
+ .addReg(Base.getReg(), RegState::Implicit)
+ .addReg(Index.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(BaseReg)
+ .addReg(IndexReg);
}
- return NewMI;
- }
- // If the base is inefficient try switching the index and base operands,
- // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
- // lea offset(%base,%index,scale),%dst =>
- // lea (%base,%index,scale); add offset,%dst
- if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
- MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
- .add(Dst)
- .add(IsInefficientBase ? Index : Base)
- .add(Scale)
- .add(IsInefficientBase ? Base : Index)
- .addImm(0)
- .add(Segment);
+ } else if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+ // If the base is inefficient try switching the index and base operands,
+ // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
+ // lea offset(%base,%index,scale),%dst =>
+ // lea (%base,%index,scale); add offset,%dst
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+ .add(Dest)
+ .add(IsInefficientBase ? Index : Base)
+ .add(Scale)
+ .add(IsInefficientBase ? Base : Index)
+ .addImm(0)
+ .add(Segment);
LLVM_DEBUG(NewMI->dump(););
+ }
+
+ // If either replacement succeeded above, add the offset if needed, then
+ // replace the instruction.
+ if (NewMI) {
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- LLVM_DEBUG(NewMI->dump(););
+ if (OptIncDec && Offset.isImm() &&
+ (Offset.getImm() == 1 || Offset.getImm() == -1)) {
+ unsigned NewOpc =
+ getINCDECFromLEA(MI.getOpcode(), Offset.getImm() == 1);
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg);
+ LLVM_DEBUG(NewMI->dump(););
+ } else {
+ unsigned NewOpc = getADDriFromLEA(MI.getOpcode(), Offset);
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg)
+ .add(Offset);
+ LLVM_DEBUG(NewMI->dump(););
+ }
}
- return NewMI;
+
+ MBB.erase(I);
+ I = NewMI;
+ return;
}
+
// Handle the rest of the cases with inefficient base register:
- assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+ assert(DestReg != BaseReg && "DestReg == BaseReg should be handled already!");
assert(IsInefficientBase && "efficient base should be handled already!");
+ // FIXME: Handle LEA64_32r.
+ if (LEAOpcode == X86::LEA64_32r)
+ return;
+
// lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
if (IsScale1 && !hasLEAOffset(Offset)) {
- bool BIK = Base.isKill() && BaseR != IndexR;
- TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK);
+ bool BIK = Base.isKill() && BaseReg != IndexReg;
+ TII->copyPhysReg(MBB, MI, MI.getDebugLoc(), DestReg, BaseReg, BIK);
LLVM_DEBUG(MI.getPrevNode()->dump(););
- MachineInstr *NewMI =
- BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg)
+ .add(Index);
LLVM_DEBUG(NewMI->dump(););
- return NewMI;
+ return;
}
+
// lea offset(%base,%index,scale), %dst =>
// lea offset( ,%index,scale), %dst; add %base,%dst
- MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
- .add(Dst)
- .addReg(0)
- .add(Scale)
- .add(Index)
- .add(Offset)
- .add(Segment);
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+ .add(Dest)
+ .addReg(0)
+ .add(Scale)
+ .add(Index)
+ .add(Offset)
+ .add(Segment);
LLVM_DEBUG(NewMI->dump(););
- NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg)
+ .add(Base);
LLVM_DEBUG(NewMI->dump(););
- return NewMI;
+
+ MBB.erase(I);
+ I = NewMI;
}
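
The rewritten processInstrForSlow3OpLEA breaks a three-operand LEA into cheaper ADD/INC/DEC sequences when the destination aliases the base or index and the scale is 1. A scalar sketch of the identity it relies on (the values are made up; only the arithmetic matters):

    #include <cassert>
    #include <cstdint>

    // lea disp(%base,%index,scale), %dst computes this without touching flags.
    static uint64_t lea(uint64_t Base, uint64_t Index, uint64_t Scale,
                        int64_t Disp) {
      return Base + Index * Scale + Disp;
    }

    int main() {
      uint64_t Base = 0x1000, Index = 0x20;
      // First case handled by the pass: scale == 1 and dst == base, so the LEA
      // becomes "add %index, %dst" plus an add of the offset, or inc/dec when
      // OptIncDec is set and the offset is +/-1.
      uint64_t Dst = Base;
      Dst += Index; // add %index, %dst
      Dst += 1;     // disp == 1 becomes inc %dst
      assert(Dst == lea(Base, Index, /*Scale=*/1, /*Disp=*/1));
      return 0;
    }
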
diff --git a/lib/Target/X86/X86FixupSetCC.cpp b/lib/Target/X86/X86FixupSetCC.cpp
index e2d4d1ede6f3..cbde280aa280 100644
--- a/lib/Target/X86/X86FixupSetCC.cpp
+++ b/lib/Target/X86/X86FixupSetCC.cpp
@@ -136,8 +136,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
? &X86::GR32RegClass
: &X86::GR32_ABCDRegClass;
- unsigned ZeroReg = MRI->createVirtualRegister(RC);
- unsigned InsertReg = MRI->createVirtualRegister(RC);
+ Register ZeroReg = MRI->createVirtualRegister(RC);
+ Register InsertReg = MRI->createVirtualRegister(RC);
// Initialize a register with 0. This must go before the eflags def
BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp
index 5ce3255ea96a..cfba06fb6533 100644
--- a/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -721,8 +721,9 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
for (MachineInstr &MI :
llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
X86::CondCode Cond = X86::getCondFromSETCC(MI);
- if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() &&
- TRI->isVirtualRegister(MI.getOperand(0).getReg())) {
+ if (Cond != X86::COND_INVALID && !MI.mayStore() &&
+ MI.getOperand(0).isReg() &&
+ Register::isVirtualRegister(MI.getOperand(0).getReg())) {
assert(MI.getOperand(0).isDef() &&
"A non-storing SETcc should always define a register!");
CondRegs[Cond] = MI.getOperand(0).getReg();
@@ -739,7 +740,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc, X86::CondCode Cond) {
- unsigned Reg = MRI->createVirtualRegister(PromoteRC);
+ Register Reg = MRI->createVirtualRegister(PromoteRC);
auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
TII->get(X86::SETCCr), Reg).addImm(Cond);
(void)SetI;
@@ -813,7 +814,7 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic(
MachineBasicBlock &MBB = *MI.getParent();
// Insert an instruction that will set the flag back to the desired value.
- unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
+ Register TmpReg = MRI->createVirtualRegister(PromoteRC);
auto AddI =
BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
.addDef(TmpReg, RegState::Dead)
@@ -974,7 +975,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
// Now we need to turn this into a bitmask. We do this by subtracting it from
// zero.
- unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ Register ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
ZeroReg = AdjustReg(ZeroReg);
@@ -999,7 +1000,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
default:
llvm_unreachable("Invalid SETB_C* opcode!");
}
- unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
+ Register ResultReg = MRI->createVirtualRegister(&SetBRC);
BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
.addReg(ZeroReg)
.addReg(ExtCondReg);
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 074cf21d03f5..fcfb5bc91314 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -288,8 +288,8 @@ namespace {
// Check if a COPY instruction is using FP registers.
static bool isFPCopy(MachineInstr &MI) {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
return X86::RFP80RegClass.contains(DstReg) ||
X86::RFP80RegClass.contains(SrcReg);
@@ -313,7 +313,7 @@ FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
/// For example, this returns 3 for X86::FP3.
static unsigned getFPReg(const MachineOperand &MO) {
assert(MO.isReg() && "Expected an FP register!");
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
return Reg - X86::FP0;
}
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index e310fe069117..1b469a814adc 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -35,8 +35,8 @@
using namespace llvm;
X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
- unsigned StackAlignOverride)
- : TargetFrameLowering(StackGrowsDown, StackAlignOverride,
+ MaybeAlign StackAlignOverride)
+ : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),
STI.is64Bit() ? -8 : -4),
STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
// Cache a bunch of frame-related predicates for this subtarget.
@@ -176,7 +176,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
MachineOperand &MO = MBBI->getOperand(i);
if (!MO.isReg() || MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
@@ -216,7 +216,7 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg != X86::EFLAGS)
continue;
@@ -995,11 +995,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;
bool NeedsDwarfCFI =
!IsWin64Prologue && (MMI.hasDebugInfo() || Fn.needsUnwindTableEntry());
- unsigned FramePtr = TRI->getFrameRegister(MF);
- const unsigned MachineFramePtr =
+ Register FramePtr = TRI->getFrameRegister(MF);
+ const Register MachineFramePtr =
STI.isTarget64BitILP32()
- ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
- unsigned BasePtr = TRI->getBaseRegister();
+ ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
+ Register BasePtr = TRI->getBaseRegister();
bool HasWinCFI = false;
// Debug location must be unknown since the first debug location is used
@@ -1016,14 +1016,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();
-
- // The default stack probe size is 4096 if the function has no stackprobesize
- // attribute.
- unsigned StackProbeSize = 4096;
- if (Fn.hasFnAttribute("stack-probe-size"))
- Fn.getFnAttribute("stack-probe-size")
- .getValueAsString()
- .getAsInteger(0, StackProbeSize);
+ unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
// Re-align the stack on 64-bit if the x86-interrupt calling convention is
// used and an error code was pushed, since the x86-64 ABI requires a 16-byte
@@ -1081,7 +1074,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
int stackGrowth = -SlotSize;
// Find the funclet establisher parameter
- unsigned Establisher = X86::NoRegister;
+ Register Establisher = X86::NoRegister;
if (IsClrFunclet)
Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
else if (IsFunclet)
@@ -1192,7 +1185,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
(MBBI->getOpcode() == X86::PUSH32r ||
MBBI->getOpcode() == X86::PUSH64r)) {
PushedRegs = true;
- unsigned Reg = MBBI->getOperand(0).getReg();
+ Register Reg = MBBI->getOperand(0).getReg();
++MBBI;
if (!HasFP && NeedsDwarfCFI) {
@@ -1396,9 +1389,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
int FI;
if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
if (X86::FR64RegClass.contains(Reg)) {
+ int Offset;
unsigned IgnoredFrameReg;
- int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
- Offset += SEHFrameOffset;
+ if (IsWin64Prologue && IsFunclet)
+ Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
+ else
+ Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) +
+ SEHFrameOffset;
HasWinCFI = true;
assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
@@ -1554,9 +1551,13 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
unsigned
X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
// This is the size of the pushed CSRs.
- unsigned CSSize =
- MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ // This is the size of callee saved XMMs.
+ const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+ unsigned XMMSize = WinEHXMMSlotInfo.size() *
+ TRI->getSpillSize(X86::VR128RegClass);
// This is the amount of stack a funclet needs to allocate.
unsigned UsedSize;
EHPersonality Personality =
@@ -1576,7 +1577,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
// Subtract out the size of the callee saved registers. This is how much stack
// each funclet will allocate.
- return FrameSizeMinusRBP - CSSize;
+ return FrameSizeMinusRBP + XMMSize - CSSize;
}
static bool isTailCallOpcode(unsigned Opc) {
@@ -1597,9 +1598,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
DL = MBBI->getDebugLoc();
// standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
const bool Is64BitILP32 = STI.isTarget64BitILP32();
- unsigned FramePtr = TRI->getFrameRegister(MF);
+ Register FramePtr = TRI->getFrameRegister(MF);
unsigned MachineFramePtr =
- Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
+ Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsWin64CFI =
@@ -1850,6 +1851,20 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset + FPDelta;
}
+int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
+ int FI, unsigned &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+ const auto it = WinEHXMMSlotInfo.find(FI);
+
+ if (it == WinEHXMMSlotInfo.end())
+ return getFrameIndexReference(MF, FI, FrameReg);
+
+ FrameReg = TRI->getStackRegister();
+ return alignTo(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second;
+}
+
int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
int FI, unsigned &FrameReg,
int Adjustment) const {
@@ -1948,6 +1963,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CalleeSavedFrameSize = 0;
+ unsigned XMMCalleeSavedFrameSize = 0;
+ auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
@@ -1984,7 +2001,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// Since emitPrologue and emitEpilogue will handle spilling and restoring of
// the frame register, we can delete it from CSI list and not have to worry
// about avoiding it later.
- unsigned FPReg = TRI->getFrameRegister(MF);
+ Register FPReg = TRI->getFrameRegister(MF);
for (unsigned i = 0; i < CSI.size(); ++i) {
if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
CSI.erase(CSI.begin() + i);
@@ -2025,12 +2042,20 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
unsigned Size = TRI->getSpillSize(*RC);
unsigned Align = TRI->getSpillAlignment(*RC);
// ensure alignment
- SpillSlotOffset -= std::abs(SpillSlotOffset) % Align;
+ assert(SpillSlotOffset < 0 && "SpillSlotOffset should always be < 0 on X86");
+ SpillSlotOffset = -alignTo(-SpillSlotOffset, Align);
+
// spill into slot
SpillSlotOffset -= Size;
int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
MFI.ensureMaxAlignment(Align);
+
+ // Save the start offset and size of XMM in stack frame for funclets.
+ if (X86::VR128RegClass.contains(Reg)) {
+ WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;
+ XMMCalleeSavedFrameSize += Size;
+ }
}
return true;
@@ -2200,7 +2225,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
// Spill the BasePtr if it's used.
if (TRI->hasBasePointer(MF)){
- unsigned BasePtr = TRI->getBaseRegister();
+ Register BasePtr = TRI->getBaseRegister();
if (STI.isTarget64BitILP32())
BasePtr = getX86SubSuperRegister(BasePtr, 64);
SavedRegs.set(BasePtr);
@@ -2212,7 +2237,7 @@ HasNestArgument(const MachineFunction *MF) {
const Function &F = MF->getFunction();
for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
I != E; I++) {
- if (I->hasNestAttr())
+ if (I->hasNestAttr() && !I->use_empty())
return true;
}
return false;
@@ -2244,7 +2269,8 @@ GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Pr
bool IsNested = HasNestArgument(&MF);
if (CallingConvention == CallingConv::X86_FastCall ||
- CallingConvention == CallingConv::Fast) {
+ CallingConvention == CallingConv::Fast ||
+ CallingConvention == CallingConv::Tail) {
if (IsNested)
report_fatal_error("Segmented stacks does not support fastcall with "
"nested function.");
@@ -2525,6 +2551,18 @@ static unsigned getHiPELiteral(
+ " required but not provided");
}
+// Return true if there are no non-ehpad successors to MBB and there are no
+// non-meta instructions between MBBI and MBB.end().
+static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator MBBI) {
+ return std::all_of(
+ MBB.succ_begin(), MBB.succ_end(),
+ [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
+ std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
+ return MI.isMetaInstruction();
+ });
+}
+
/// Erlang programs may need a special prologue to handle the stack size they
/// might need at runtime. That is because Erlang/OTP does not implement a C
/// stack but uses a custom implementation of hybrid stack/heap architecture.
@@ -2758,7 +2796,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
unsigned Opcode = I->getOpcode();
bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
DebugLoc DL = I->getDebugLoc();
- uint64_t Amount = !reserveCallFrame ? TII.getFrameSize(*I) : 0;
+ uint64_t Amount = TII.getFrameSize(*I);
uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
I = MBB.erase(I);
auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
@@ -2847,7 +2885,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
return I;
}
- if (isDestroy && InternalAmt) {
+ if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) {
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back. We do this until we have
// more advanced stack pointer tracking ability.
@@ -2912,8 +2950,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
"restoring EBP/ESI on non-32-bit target");
MachineFunction &MF = *MBB.getParent();
- unsigned FramePtr = TRI->getFrameRegister(MF);
- unsigned BasePtr = TRI->getBaseRegister();
+ Register FramePtr = TRI->getFrameRegister(MF);
+ Register BasePtr = TRI->getBaseRegister();
WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
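
The frame-lowering changes above record where each callee-saved XMM register lands (WinEHXMMSlotInfo) so funclets can reach them: the funclet frame grows by the XMM save area, and getWin64EHFrameIndexRef addresses a slot as an SP-relative offset past the aligned outgoing-call area. A worked sketch of that arithmetic; the formulas come from the hunks above, the sizes are made up:

    #include <cassert>
    #include <cstdint>

    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    int main() {
      const uint64_t StackAlign = 16;
      // Hypothetical frame: 3 pushed GPR CSRs, 2 spilled XMMs, 40 bytes of locals.
      uint64_t CSSize = 3 * 8;
      uint64_t XMMSize = 2 * 16;
      uint64_t UsedSize = 40;
      uint64_t MaxCallFrameSize = 36;

      // getWinEHFuncletFrameSize: FrameSizeMinusRBP + XMMSize - CSSize.
      uint64_t FrameSizeMinusRBP = alignTo(CSSize + UsedSize, StackAlign);
      assert(FrameSizeMinusRBP + XMMSize - CSSize == 72);

      // getWin64EHFrameIndexRef: SP-relative offset of the second XMM slot.
      uint64_t SlotOffset = 16; // WinEHXMMSlotInfo[FI] for the second XMM
      assert(alignTo(MaxCallFrameSize, StackAlign) + SlotOffset == 64);
      return 0;
    }
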
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index d32746e3a36e..2103d6471ead 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -25,7 +25,7 @@ class X86RegisterInfo;
class X86FrameLowering : public TargetFrameLowering {
public:
- X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride);
+ X86FrameLowering(const X86Subtarget &STI, MaybeAlign StackAlignOverride);
// Cached subtarget predicates.
@@ -99,6 +99,8 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
+ int getWin64EHFrameIndexRef(const MachineFunction &MF,
+ int FI, unsigned &SPReg) const;
int getFrameIndexReferenceSP(const MachineFunction &MF,
int FI, unsigned &SPReg, int Adjustment) const;
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 95d31e62cafc..5b546d42d98a 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -253,6 +253,11 @@ namespace {
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
}
+ bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
@@ -362,6 +367,11 @@ namespace {
if (User->getNumOperands() != 2)
continue;
+ // If this can match to INC/DEC, don't count it as a use.
+ if (User->getOpcode() == ISD::ADD &&
+ (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
+ continue;
+
// Immediates that are used for offsets as part of stack
// manipulation should be left alone. These are typically
// used to indicate SP offsets for argument passing and
@@ -502,8 +512,10 @@ namespace {
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
+ bool combineIncDecVector(SDNode *Node);
bool tryShrinkShlLogicImm(SDNode *N);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
+ bool tryMatchBitSelect(SDNode *N);
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node);
@@ -746,7 +758,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
return false;
LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
if (!LD ||
- LD->isVolatile() ||
+ !LD->isSimple() ||
LD->getAddressingMode() != ISD::UNINDEXED ||
LD->getExtensionType() != ISD::NON_EXTLOAD)
return false;
@@ -873,10 +885,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
case ISD::FRINT: Imm = 0x4; break;
}
SDLoc dl(N);
- SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
- N->getValueType(0),
- N->getOperand(0),
- CurDAG->getConstant(Imm, dl, MVT::i8));
+ SDValue Res = CurDAG->getNode(
+ X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0),
+ CurDAG->getTargetConstant(Imm, dl, MVT::i8));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
@@ -2305,10 +2316,10 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
return false;
// We can allow a full vector load here since narrowing a load is ok unless
- // it's volatile.
+ // it's volatile or atomic.
if (ISD::isNON_EXTLoad(N.getNode())) {
LoadSDNode *LD = cast<LoadSDNode>(N);
- if (!LD->isVolatile() &&
+ if (LD->isSimple() &&
IsProfitableToFold(N, LD, Root) &&
IsLegalToFold(N, Parent, Root, OptLevel)) {
PatternNodeWithChain = N;
@@ -2464,6 +2475,37 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
Complexity += 2;
}
+ // Heuristic: try harder to form an LEA from ADD if the operands set flags.
+ // Unlike ADD, LEA does not affect flags, so we will be less likely to require
+ // duplicating flag-producing instructions later in the pipeline.
+ if (N.getOpcode() == ISD::ADD) {
+ auto isMathWithFlags = [](SDValue V) {
+ switch (V.getOpcode()) {
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::ADC:
+ case X86ISD::SBB:
+ /* TODO: These opcodes can be added safely, but we may want to justify
+ their inclusion for different reasons (better for reg-alloc).
+ case X86ISD::SMUL:
+ case X86ISD::UMUL:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ */
+ // Value 1 is the flag output of the node - verify it's not dead.
+ return !SDValue(V.getNode(), 1).use_empty();
+ default:
+ return false;
+ }
+ };
+ // TODO: This could be an 'or' rather than 'and' to make the transform more
+ // likely to happen. We might want to factor in whether there's a
+ // load folding opportunity for the math op that disappears with LEA.
+ if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
+ Complexity++;
+ }
+
if (AM.Disp)
Complexity++;
@@ -2544,6 +2586,7 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
+ assert(Root && P && "Unknown root/parent nodes");
if (!ISD::isNON_EXTLoad(N.getNode()) ||
!IsProfitableToFold(N, P, Root) ||
!IsLegalToFold(N, P, Root, OptLevel))
@@ -2553,6 +2596,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
+bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ assert(Root && P && "Unknown root/parent nodes");
+ if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
+ !IsProfitableToFold(N, P, Root) ||
+ !IsLegalToFold(N, P, Root, OptLevel))
+ return false;
+
+ return selectAddr(N.getNode(),
+ N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
@@ -3302,8 +3359,12 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
SDValue ImplDef = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
- NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
- NBits);
+
+ SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
+ NBits = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
+ NBits, SRIdxVal), 0);
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
if (Subtarget->hasBMI2()) {
@@ -3400,8 +3461,9 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
// TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
// hoisting the move immediate would make it worthwhile with a less optimal
// BEXTR?
- if (!Subtarget->hasTBM() &&
- !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
+ bool PreferBEXTR =
+ Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
+ if (!PreferBEXTR && !Subtarget->hasBMI2())
return nullptr;
// Must have a shift right.
@@ -3440,23 +3502,50 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
if (Shift + MaskSize > NVT.getSizeInBits())
return nullptr;
- SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
- unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
- unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
+ // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
+ // does not fit into 32 bits. Load folding is not a sufficient reason.
+ if (!PreferBEXTR && MaskSize <= 32)
+ return nullptr;
- // BMI requires the immediate to placed in a register.
- if (!Subtarget->hasTBM()) {
- ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
- MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+ SDValue Control;
+ unsigned ROpc, MOpc;
+
+ if (!PreferBEXTR) {
+ assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
+ // If we can't make use of BEXTR then we can't fuse shift+mask stages.
+ // Let's perform the mask first, and apply shift later. Note that we need to
+ // widen the mask to account for the fact that we'll apply shift afterwards!
+ Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
+ ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
+ MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
- New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
+ Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+ } else {
+ // The 'control' of BEXTR has the pattern of:
+ // [15...8 bit][ 7...0 bit] location
+ // [ bit count][ shift] name
+ // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b11
+ Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+ if (Subtarget->hasTBM()) {
+ ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+ MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ } else {
+ assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
+ // BMI requires the immediate to be placed in a register.
+ ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+ MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+ unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
+ Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+ }
}
MachineSDNode *NewNode;
SDValue Input = N0->getOperand(0);
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+ SDValue Ops[] = {
+ Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
@@ -3464,7 +3553,15 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
// Record the mem-refs
CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
} else {
- NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
+ NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
+ }
+
+ if (!PreferBEXTR) {
+ // We still need to apply the shift.
+ SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
+ unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
+ NewNode =
+ CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
}
return NewNode;
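
The control-word layout described in the comment above, and the BZHI-plus-shift fallback that the rewritten matchBEXTRFromAndImm emits when BEXTR is not preferable, can both be checked with a scalar model. This is illustrative only; it shows that both sequences compute (x >> Shift) & ((1 << MaskSize) - 1):

    #include <cassert>
    #include <cstdint>

    // bextr dst, src, control: control[7:0] = shift, control[15:8] = bit count.
    static uint64_t bextr(uint64_t Src, uint16_t Control) {
      unsigned Shift = Control & 0xFF;
      unsigned Count = (Control >> 8) & 0xFF;
      uint64_t Mask = Count >= 64 ? ~0ull : (1ull << Count) - 1;
      return (Src >> Shift) & Mask;
    }

    // bzhi dst, src, n: zero all bits of src from position n upward.
    static uint64_t bzhi(uint64_t Src, unsigned N) {
      return N >= 64 ? Src : Src & ((1ull << N) - 1);
    }

    int main() {
      uint64_t X = 0xABCD1234;
      unsigned Shift = 4, MaskSize = 8;
      uint64_t Expected = (X >> Shift) & 0xFF;
      // One BEXTR with the fused control word...
      assert(bextr(X, Shift | (MaskSize << 8)) == Expected);
      // ...or a widened BZHI followed by the shift, as emitted above.
      assert((bzhi(X, Shift + MaskSize) >> Shift) == Expected);
      return 0;
    }
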
@@ -3735,6 +3832,52 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
return true;
}
+/// Convert vector increment or decrement to sub/add with an all-ones constant:
+/// add X, <1, 1...> --> sub X, <-1, -1...>
+/// sub X, <1, 1...> --> add X, <-1, -1...>
+/// The all-ones vector constant can be materialized using a pcmpeq instruction
+/// that is commonly recognized as an idiom (has no register dependency), so
+/// that's better/smaller than loading a splat 1 constant.
+bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
+ assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
+ "Unexpected opcode for increment/decrement transform");
+
+ EVT VT = Node->getValueType(0);
+ assert(VT.isVector() && "Should only be called for vectors.");
+
+ SDValue X = Node->getOperand(0);
+ SDValue OneVec = Node->getOperand(1);
+
+ APInt SplatVal;
+ if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
+ return false;
+
+ SDLoc DL(Node);
+ SDValue OneConstant, AllOnesVec;
+
+ APInt Ones = APInt::getAllOnesValue(32);
+ assert(VT.getSizeInBits() % 32 == 0 &&
+ "Expected bit count to be a multiple of 32");
+ OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
+ insertDAGNode(*CurDAG, X, OneConstant);
+
+ unsigned NumElts = VT.getSizeInBits() / 32;
+ assert(NumElts > 0 && "Expected to get non-empty vector.");
+ AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
+ DL, OneConstant);
+ insertDAGNode(*CurDAG, X, AllOnesVec);
+
+ AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
+ insertDAGNode(*CurDAG, X, AllOnesVec);
+
+ unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+ SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
+
+ ReplaceNode(Node, NewNode.getNode());
+ SelectCode(NewNode.getNode());
+ return true;
+}
+
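
combineIncDecVector leans on the two's-complement identity x + 1 == x - (-1): the all-ones (-1) splat is cheap on x86 because pcmpeq of a register with itself produces it without loading a constant. A scalar check of the identity (illustrative; the transform applies it lane-wise to vectors):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t AllOnes = 0xFFFFFFFFu; // what "pcmpeq x, x" yields per lane
      for (uint32_t X : {0u, 1u, 42u, 0xFFFFFFFFu}) {
        // add X, <1,1,...>  ==>  sub X, <-1,-1,...>
        assert(uint32_t(X + 1u) == uint32_t(X - AllOnes));
        // sub X, <1,1,...>  ==>  add X, <-1,-1,...>
        assert(uint32_t(X - 1u) == uint32_t(X + AllOnes));
      }
      return 0;
    }
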
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
@@ -3975,12 +4118,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return false;
- // See if we're comparing against zero. This should have been canonicalized
- // to RHS during lowering.
- if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
+ SDValue SetccOp0 = Setcc.getOperand(0);
+ SDValue SetccOp1 = Setcc.getOperand(1);
+
+ // Canonicalize the all zero vector to the RHS.
+ if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
+ std::swap(SetccOp0, SetccOp1);
+
+ // See if we're comparing against zero.
+ if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
return false;
- SDValue N0 = Setcc.getOperand(0);
+ SDValue N0 = SetccOp0;
MVT CmpVT = N0.getSimpleValueType();
MVT CmpSVT = CmpVT.getVectorElementType();
@@ -4027,13 +4176,14 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
// Look through single use bitcasts.
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
- Src = Src.getOperand(0);
-
- if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
+ if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
Parent = Src.getNode();
Src = Src.getOperand(0);
- if (Src.getSimpleValueType() == CmpSVT)
+ }
+
+ if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
+ if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
return Src;
}
@@ -4045,17 +4195,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
bool FoldedBCast = false;
if (!FoldedLoad && CanFoldLoads &&
(CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
- SDNode *ParentNode = nullptr;
+ SDNode *ParentNode = N0.getNode();
if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
- FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
- Tmp1, Tmp2, Tmp3, Tmp4);
+ FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
}
// Try the other operand.
if (!FoldedBCast) {
+ SDNode *ParentNode = N0.getNode();
if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
- FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
- Tmp1, Tmp2, Tmp3, Tmp4);
+ FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
if (FoldedBCast)
std::swap(Src0, Src1);
}
@@ -4125,7 +4276,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
// Update the chain.
ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
// Record the mem-refs
- CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+ CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
} else {
if (IsMasked)
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
@@ -4146,6 +4297,55 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
return true;
}
+// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
+// into vpternlog.
+bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
+ assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
+
+ MVT NVT = N->getSimpleValueType(0);
+
+ // Make sure we support VPTERNLOG.
+ if (!NVT.isVector() || !Subtarget->hasAVX512())
+ return false;
+
+ // We need VLX for 128/256-bit.
+ if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
+ std::swap(N0, N1);
+
+ if (N0.getOpcode() != ISD::AND ||
+ N1.getOpcode() != X86ISD::ANDNP ||
+ !N0.hasOneUse() || !N1.hasOneUse())
+ return false;
+
+ // ANDN is not commutable, use it to pin down A and C.
+ SDValue A = N1.getOperand(0);
+ SDValue C = N1.getOperand(1);
+
+ // AND is commutable, if one operand matches A, the other operand is B.
+ // Otherwise this isn't a match.
+ SDValue B;
+ if (N0.getOperand(0) == A)
+ B = N0.getOperand(1);
+ else if (N0.getOperand(1) == A)
+ B = N0.getOperand(0);
+ else
+ return false;
+
+ SDLoc dl(N);
+ SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
+ SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
+ ReplaceNode(N, Ternlog.getNode());
+ SelectCode(Ternlog.getNode());
+ return true;
+}
+
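Note: the 0xCA immediate encodes the bit-select truth table result = A ? B : C. A small self-contained check of that encoding (illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // vpternlog evaluates bit (a*4 + b*2 + c) of the immediate for each
      // input combination; for (A & B) | (~A & C) that table is
      // 0b11001010 == 0xCA.
      uint8_t Imm = 0;
      for (int a = 0; a <= 1; ++a)
        for (int b = 0; b <= 1; ++b)
          for (int c = 0; c <= 1; ++c)
            Imm |= (a ? b : c) << (a * 4 + b * 2 + c);
      assert(Imm == 0xCA);
      return 0;
    }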
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opcode = Node->getOpcode();
@@ -4170,6 +4370,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned Opc = 0;
switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_sse3_monitor:
if (!Subtarget->hasSSE3())
break;
@@ -4303,9 +4504,16 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (tryShrinkShlLogicImm(Node))
return;
+ if (Opcode == ISD::OR && tryMatchBitSelect(Node))
+ return;
+
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB: {
+ if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
+ combineIncDecVector(Node))
+ return;
+
// Try to avoid folding immediates with multiple uses for optsize.
// This code tries to select to register form directly to avoid going
// through the isel table which might fold the immediate. We can't change
@@ -4333,6 +4541,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (!isInt<8>(Val) && !isInt<32>(Val))
break;
+ // If this can match to INC/DEC, let it go.
+ if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
+ break;
+
// Check if we should avoid folding this immediate.
if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
break;
@@ -4610,7 +4822,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
- SExtOpcode = X86::CBW;
+ SExtOpcode = 0; // Not used.
break;
case MVT::i16:
LoReg = X86::AX; HiReg = X86::DX;
@@ -4632,24 +4844,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
bool signBitIsZero = CurDAG->SignBitIsZero(N0);
SDValue InFlag;
- if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+ if (NVT == MVT::i8) {
// Special case for div8, just use a move with zero extension to AX to
// clear the upper 8 bits (AH).
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
MachineSDNode *Move;
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
- Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
- MVT::Other, Ops);
+ unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
+ : X86::MOVZX16rm8;
+ Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
Chain = SDValue(Move, 1);
ReplaceUses(N0.getValue(1), Chain);
// Record the mem-refs
CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
} else {
- Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
+ unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
+ : X86::MOVZX16rr8;
+ Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
Chain = CurDAG->getEntryNode();
}
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
SDValue());
InFlag = Chain.getValue(1);
} else {
@@ -4996,10 +5211,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case ISD::FRINT: Imm = 0x4; break;
}
SDLoc dl(Node);
- SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
- Node->getValueType(0),
+ SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0),
Node->getOperand(0),
- CurDAG->getConstant(Imm, dl, MVT::i8));
+ CurDAG->getTargetConstant(Imm, dl, MVT::i8));
ReplaceNode(Node, Res.getNode());
SelectCode(Res.getNode());
return;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0b4bf687e6cf..ed975e9248a8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -65,17 +65,19 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
-static cl::opt<bool> ExperimentalVectorWideningLegalization(
- "x86-experimental-vector-widening-legalization", cl::init(false),
- cl::desc("Enable an experimental vector type legalization through widening "
- "rather than promotion."),
- cl::Hidden);
-
static cl::opt<int> ExperimentalPrefLoopAlignment(
"x86-experimental-pref-loop-alignment", cl::init(4),
- cl::desc("Sets the preferable loop alignment for experiments "
- "(the last x86-experimental-pref-loop-alignment bits"
- " of the loop header PC will be 0)."),
+ cl::desc(
+ "Sets the preferable loop alignment for experiments (as log2 bytes)"
+ "(the last x86-experimental-pref-loop-alignment bits"
+ " of the loop header PC will be 0)."),
+ cl::Hidden);
+
+// Added in 10.0.
+static cl::opt<bool> EnableOldKNLABI(
+ "x86-enable-old-knl-abi", cl::init(false),
+ cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
+ "one ZMM register on AVX512F, but not AVX512BW targets."),
cl::Hidden);
static cl::opt<bool> MulConstantOptimization(
@@ -84,6 +86,13 @@ static cl::opt<bool> MulConstantOptimization(
"SHIFT, LEA, etc."),
cl::Hidden);
+static cl::opt<bool> ExperimentalUnorderedISEL(
+ "x86-experimental-unordered-atomic-isel", cl::init(false),
+ cl::desc("Use LoadSDNode and StoreSDNode instead of "
+ "AtomicSDNode for unordered atomic loads and "
+ "stores respectively."),
+ cl::Hidden);
+
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
@@ -196,7 +205,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
- setOperationAction(ISD::ABS , MVT::i32 , Custom);
+ setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
setOperationAction(ISD::ABS , MVT::i64 , Custom);
@@ -214,14 +223,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
- if (Subtarget.is64Bit()) {
- if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
- // f32/f64 are legal, f80 is custom.
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
- else
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
- setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
- } else if (!Subtarget.useSoftFloat()) {
+ if (!Subtarget.useSoftFloat()) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
@@ -277,29 +279,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
- if (Subtarget.is64Bit()) {
- if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
- // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
- setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
- } else {
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
- setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
- }
- } else if (!Subtarget.useSoftFloat()) {
- // Since AVX is a superset of SSE3, only check for SSE here.
- if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
- // Expand FP_TO_UINT into a select.
- // FIXME: We would like to use a Custom expander here eventually to do
- // the optimal thing for SSE vs. the default expansion in the legalizer.
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
- else
- // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
- // With SSE3 we can use fisttpll to convert to a signed i64; without
- // SSE, we're stuck with a fistpll.
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
-
- setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ if (!Subtarget.useSoftFloat()) {
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
}
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
@@ -345,11 +327,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
- setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
+ setOperationAction(ISD::FREM , MVT::f128 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them on up to i32 which has a shorter
@@ -396,15 +378,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// There's never any support for operations beyond MVT::f32.
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f16, Expand);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
@@ -638,17 +624,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
- // Long double always uses X87, except f128 in MMX.
+ // f80 always uses X87.
if (UseX87) {
- if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
- addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
- : &X86::VR128RegClass);
- ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
- setOperationAction(ISD::FABS , MVT::f128, Custom);
- setOperationAction(ISD::FNEG , MVT::f128, Custom);
- setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
- }
-
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
@@ -684,10 +661,60 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LLRINT, MVT::f80, Expand);
}
+ // f128 uses xmm registers, but most operations require libcalls.
+ if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
+ addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+
+ addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
+
+ setOperationAction(ISD::FADD, MVT::f128, Custom);
+ setOperationAction(ISD::FSUB, MVT::f128, Custom);
+ setOperationAction(ISD::FDIV, MVT::f128, Custom);
+ setOperationAction(ISD::FMUL, MVT::f128, Custom);
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+
+ setOperationAction(ISD::FABS, MVT::f128, Custom);
+ setOperationAction(ISD::FNEG, MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ // We need to custom handle any FP_ROUND with an f128 input, but
+ // LegalizeDAG uses the result type to know when to run a custom handler.
+ // So we have to list all legal floating point result types here.
+ if (isTypeLegal(MVT::f32)) {
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
+ }
+ if (isTypeLegal(MVT::f64)) {
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
+ }
+ if (isTypeLegal(MVT::f80)) {
+ setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
+ }
+
+ setOperationAction(ISD::SETCC, MVT::f128, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+ }
+
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f128 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
@@ -716,7 +743,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
@@ -754,7 +781,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
- for (MVT InnerVT : MVT::vector_valuetypes()) {
+ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
@@ -797,6 +824,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
+
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -823,10 +852,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
setOperationAction(ISD::MUL, MVT::v2i8, Custom);
- setOperationAction(ISD::MUL, MVT::v2i16, Custom);
- setOperationAction(ISD::MUL, MVT::v2i32, Custom);
setOperationAction(ISD::MUL, MVT::v4i8, Custom);
- setOperationAction(ISD::MUL, MVT::v4i16, Custom);
setOperationAction(ISD::MUL, MVT::v8i8, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
@@ -863,28 +889,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
- if (!ExperimentalVectorWideningLegalization) {
- // Use widening instead of promotion.
- for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
- MVT::v4i16, MVT::v2i16 }) {
- setOperationAction(ISD::UADDSAT, VT, Custom);
- setOperationAction(ISD::SADDSAT, VT, Custom);
- setOperationAction(ISD::USUBSAT, VT, Custom);
- setOperationAction(ISD::SSUBSAT, VT, Custom);
- }
- }
-
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- // Provide custom widening for v2f32 setcc. This is really for VLX when
- // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
- // type legalization changing the result type to v4i1 during widening.
- // It works fine for SSE2 and is probably faster so no need to qualify with
- // VLX support.
- setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
-
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
@@ -904,19 +912,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
- // We support custom legalizing of sext and anyext loads for specific
- // memory vector types which we can load as a scalar (or sequence of
- // scalars) and extend in-register to a legal 128-bit vector type. For sext
- // loads these must work with a single scalar load.
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
- }
-
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
@@ -938,7 +933,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
@@ -991,18 +985,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
- if (ExperimentalVectorWideningLegalization) {
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
- } else {
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
- }
+ setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
@@ -1069,22 +1059,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
}
- if (!ExperimentalVectorWideningLegalization) {
- // Avoid narrow result types when widening. The legal types are listed
- // in the next loop.
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
- }
- }
-
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
- if (!ExperimentalVectorWideningLegalization)
- setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
@@ -1145,6 +1123,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom);
+
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
@@ -1292,10 +1272,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STORE, VT, Custom);
}
- if (HasInt256)
- setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
-
if (HasInt256) {
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
@@ -1407,6 +1386,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom);
+
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
@@ -1433,12 +1414,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- if (ExperimentalVectorWideningLegalization) {
- // Need to custom widen this if we don't have AVX512BW.
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
- }
+ // Need to custom widen this if we don't have AVX512BW.
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
@@ -1529,10 +1508,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
- // Need to custom split v32i16/v64i8 bitcasts.
if (!Subtarget.hasBWI()) {
+ // Need to custom split v32i16/v64i8 bitcasts.
setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
+
+ // Better to split these into two 256-bit ops.
+ setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
}
if (Subtarget.hasVBMI2()) {
@@ -1777,6 +1760,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSHR, VT, Custom);
}
}
+
+ setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
// We want to custom lower some of our intrinsics.
@@ -1905,13 +1892,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxLoadsPerMemcmpOptSize = 2;
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
- setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
+ setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
- setPrefFunctionAlignment(4); // 2^4 bytes.
+ setPrefFunctionAlignment(Align(16));
verifyIntrinsicTables();
}
@@ -1939,8 +1926,7 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return TypeSplitVector;
- if (ExperimentalVectorWideningLegalization &&
- VT.getVectorNumElements() != 1 &&
+ if (VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
@@ -1950,19 +1936,62 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
+ // v32i1 vectors should be promoted to v32i8 to match avx2.
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return MVT::v32i8;
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512() &&
+ (!isPowerOf2_32(VT.getVectorNumElements()) ||
+ (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
+ (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
+ return MVT::i8;
+ // FIXME: Should we just make these types legal and custom split operations?
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
+ Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ return MVT::v16i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
+ // v32i1 vectors should be promoted to v32i8 to match avx2.
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return 1;
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512() &&
+ (!isPowerOf2_32(VT.getVectorNumElements()) ||
+ (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
+ (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
+ return VT.getVectorNumElements();
+ // FIXME: Should we just make these types legal and custom split operations?
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
+ Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
+unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const {
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512() &&
+ (!isPowerOf2_32(VT.getVectorNumElements()) ||
+ (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
+ (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
+ RegisterVT = MVT::i8;
+ IntermediateVT = MVT::i1;
+ NumIntermediates = VT.getVectorNumElements();
+ return NumIntermediates;
+ }
+
+ return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+}
+
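Note: a toy predicate mirroring the vXi1 condition shared by the three hooks above. The function name and signature are illustrative only, not LLVM API.

    #include <cassert>

    // Break a vXi1 mask argument into one i8 per element when the element
    // count is not a power of two, or is wider than a single legal k-register
    // for the subtarget (16 elements without AVX512BW, 64 with it).
    static bool breaksMaskIntoScalars(unsigned NumElts, bool HasBWI) {
      auto IsPow2 = [](unsigned N) { return N != 0 && (N & (N - 1)) == 0; };
      return !IsPow2(NumElts) || (NumElts > 16 && !HasBWI) ||
             (NumElts > 64 && HasBWI);
    }

    int main() {
      assert(breaksMaskIntoScalars(3, /*HasBWI=*/true));    // odd element count
      assert(breaksMaskIntoScalars(64, /*HasBWI=*/false));  // too wide without BWI
      assert(!breaksMaskIntoScalars(64, /*HasBWI=*/true));  // fits one k-register
      assert(!breaksMaskIntoScalars(16, /*HasBWI=*/false)); // fits one k-register
      return 0;
    }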
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
@@ -2060,6 +2089,11 @@ EVT X86TargetLowering::getOptimalMemOpType(
if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16)))) {
+ // FIXME: Check if unaligned 64-byte accesses are slow.
+ if (Size >= 64 && Subtarget.hasAVX512() &&
+ (Subtarget.getPreferVectorWidth() >= 512)) {
+ return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
+ }
// FIXME: Check if unaligned 32-byte accesses are slow.
if (Size >= 32 && Subtarget.hasAVX() &&
(Subtarget.getPreferVectorWidth() >= 256)) {
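Note: a sketch of the new 64-byte decision in isolation. The helper below is a toy model, not the real getOptimalMemOpType signature.

    #include <cassert>
    #include <cstdint>
    #include <string>

    // Prefer a full ZMM-sized type when the copy is at least 64 bytes and
    // 512-bit vectors are both available and preferred by the subtarget.
    static std::string memopType(uint64_t Size, bool HasAVX512, bool HasBWI,
                                 unsigned PreferVectorWidth) {
      if (Size >= 64 && HasAVX512 && PreferVectorWidth >= 512)
        return HasBWI ? "v64i8" : "v16i32";
      return "smaller";
    }

    int main() {
      assert(memopType(128, true, true, 512) == "v64i8");
      assert(memopType(128, true, false, 512) == "v16i32");
      assert(memopType(128, true, true, 256) == "smaller"); // prefer-vector-width < 512
      return 0;
    }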
@@ -2403,8 +2437,8 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
- const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
- SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
+ const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
@@ -2537,7 +2571,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
- Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
+ Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
Subtarget);
assert(2 == RegsToPass.size() &&
@@ -2816,6 +2850,10 @@ SDValue X86TargetLowering::LowerCallResult(
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (CopyVT == MVT::f64 &&
+ (Is64Bit && !Subtarget.hasSSE2())) {
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -2925,7 +2963,7 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
- CC == CallingConv::HHVM);
+ CC == CallingConv::HHVM || CC == CallingConv::Tail);
}
/// Return true if we might ever do TCO for calls with this calling convention.
@@ -2951,7 +2989,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
- return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
+ return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
@@ -3405,7 +3443,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
- if (Subtarget.hasAVX512() &&
+ if (Subtarget.useAVX512Regs() &&
(Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
@@ -3577,6 +3615,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
+ bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
+ CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
@@ -3597,8 +3637,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Attr.getValueAsString() == "true")
isTailCall = false;
- if (Subtarget.isPICStyleGOT() &&
- !MF.getTarget().Options.GuaranteedTailCallOpt) {
+ if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
@@ -3625,7 +3664,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
- if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+ if (!IsGuaranteeTCO && isTailCall)
IsSibcall = true;
if (isTailCall)
@@ -3657,8 +3696,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
- else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
- canGuaranteeTCO(CallConv))
+ else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
@@ -3782,8 +3820,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Split v64i1 value into two registers
- Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
- Subtarget);
+ Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
@@ -4069,6 +4106,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+ // Save heapallocsite metadata.
+ if (CLI.CS)
+ if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
+ DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
+
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
@@ -4190,7 +4232,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!TargetRegisterInfo::isVirtualRegister(VR))
+ if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
@@ -4279,6 +4321,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
+ bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
+ CalleeCC == CallingConv::Tail;
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
@@ -4286,7 +4330,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
if (IsCalleeWin64 != IsCallerWin64)
return false;
- if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+ if (IsGuaranteeTCO) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
@@ -4413,7 +4457,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
continue;
- unsigned Reg = VA.getLocReg();
+ Register Reg = VA.getLocReg();
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
@@ -4652,7 +4696,11 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
}
- if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
+ // X >= 0 -> X == 0, jump on !sign.
+ return X86::COND_NS;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
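Note: the new SETGE case relies on "X >= 0" being exactly "sign bit clear", which is what jumping on !SF (COND_NS) after a compare-with-zero tests. A quick scalar check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int32_t X : {INT32_MIN, -1, 0, 1, INT32_MAX}) {
        bool SignSet = (static_cast<uint32_t>(X) >> 31) != 0;
        assert((X >= 0) == !SignSet);
      }
      return 0;
    }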
@@ -4760,7 +4808,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
- Info.align = 1;
+ Info.align = Align::None();
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -4773,7 +4821,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = 1;
+ Info.align = Align::None();
Info.flags |= MachineMemOperand::MOLoad;
break;
}
@@ -4785,7 +4833,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = 1;
+ Info.align = Align::None();
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -4811,6 +4859,8 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
+ assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
+
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation target a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
@@ -4852,11 +4902,12 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
-bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const {
+bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
// If we are using XMM registers in the ABI and the condition of the select is
// a floating-point compare and we have blendv or conditional move, then it is
// cheaper to select instead of doing a cross-register move and creating a
// load that depends on the compare result.
+ bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
}
@@ -4869,15 +4920,25 @@ bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
return true;
}
-bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
+bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const {
// TODO: We handle scalars using custom code, but generic combining could make
// that unnecessary.
APInt MulC;
if (!ISD::isConstantSplatVector(C.getNode(), MulC))
return false;
+ // Find the type this will be legalized to. Otherwise we might prematurely
+ // convert this to shl+add/sub and then still have to type legalize those ops.
+ // Another choice would be to defer the decision for illegal types until
+ // after type legalization. But constant splat vectors of i64 can't make it
+ // through type legalization on 32-bit targets so we would need to special
+ // case vXi64.
+ while (getTypeAction(Context, VT) != TypeLegal)
+ VT = getTypeToTransformTo(Context, VT);
+
// If vector multiply is legal, assume that's faster than shl + add/sub.
- // TODO: Multiply is a complex op with higher latency and lower througput in
+ // TODO: Multiply is a complex op with higher latency and lower throughput in
// most implementations, so this check could be loosened based on type
// and/or a CPU attribute.
if (isOperationLegal(ISD::MUL, VT))
@@ -5022,6 +5083,33 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const {
return Subtarget.hasSSE2();
}
+bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
+ return X.getValueType().isScalarInteger(); // 'bt'
+}
+
+bool X86TargetLowering::
+ shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const {
+ // Does baseline recommend not to perform the fold by default?
+ if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
+ return false;
+ // For scalars this transform is always beneficial.
+ if (X.getValueType().isScalarInteger())
+ return true;
+ // If all the shift amounts are identical, then transform is beneficial even
+ // with rudimentary SSE2 shifts.
+ if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
+ return true;
+ // If we have AVX2 with its powerful shift operations, then it's also good.
+ if (Subtarget.hasAVX2())
+ return true;
+ // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
+ return NewShiftOpcode == ISD::SHL;
+}
+
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
@@ -5054,6 +5142,14 @@ bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
return true;
}
+bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
+ SDNode *N) const {
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ !Subtarget.isOSWindows())
+ return false;
+ return true;
+}
+
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
// Any legal vector type can be splatted more efficiently than
// loading/spilling from memory.
@@ -5093,10 +5189,8 @@ static bool isUndefOrZero(int Val) {
/// Return true if every element in Mask, beginning from position Pos and ending
/// in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
- for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
- if (Mask[i] != SM_SentinelUndef)
- return false;
- return true;
+ return llvm::all_of(Mask.slice(Pos, Size),
+ [](int M) { return M == SM_SentinelUndef; });
}
/// Return true if the mask creates a vector whose lower half is undefined.
@@ -5119,10 +5213,7 @@ static bool isInRange(int Val, int Low, int Hi) {
/// Return true if the value of any element in Mask falls within the specified
/// range (L, H].
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
- for (int M : Mask)
- if (isInRange(M, Low, Hi))
- return true;
- return false;
+ return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
}
/// Return true if Val is undef or if its value falls within the
@@ -5133,12 +5224,9 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
/// Return true if every element in Mask is undef or if its value
/// falls within the specified range (L, H].
-static bool isUndefOrInRange(ArrayRef<int> Mask,
- int Low, int Hi) {
- for (int M : Mask)
- if (!isUndefOrInRange(M, Low, Hi))
- return false;
- return true;
+static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ return llvm::all_of(
+ Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
}
/// Return true if Val is undef, zero or if its value falls within the
@@ -5150,10 +5238,8 @@ static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range (L, H].
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
- for (int M : Mask)
- if (!isUndefOrZeroOrInRange(M, Low, Hi))
- return false;
- return true;
+ return llvm::all_of(
+ Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
}
/// Return true if every element in Mask, beginning
@@ -5171,8 +5257,9 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range (Low, Low+Size], or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
- unsigned Size, int Low) {
- for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
+ unsigned Size, int Low,
+ int Step = 1) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
return false;
return true;
@@ -5182,10 +5269,8 @@ static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size) {
- for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
- if (!isUndefOrZero(Mask[i]))
- return false;
- return true;
+ return llvm::all_of(Mask.slice(Pos, Size),
+ [](int M) { return isUndefOrZero(M); });
}
/// Helper function to test whether a shuffle mask could be
@@ -5357,6 +5442,8 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
SDValue Vec;
if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
+ } else if (VT.isFloatingPoint()) {
+ Vec = DAG.getConstantFP(+0.0, dl, VT);
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type");
@@ -5500,6 +5587,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
Idx == (VT.getVectorNumElements() / 2) &&
Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueType() == SubVT &&
isNullConstant(Src.getOperand(2))) {
Ops.push_back(Src.getOperand(1));
Ops.push_back(Sub);
@@ -5593,7 +5681,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
// May need to promote to a legal type.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ DAG.getConstant(0, dl, WideOpVT),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
@@ -5609,14 +5697,14 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (IdxVal == 0) {
// Zero lower bits of the Vec
- SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+ SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ DAG.getConstant(0, dl, WideOpVT),
SubVec, ZeroIdx);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
@@ -5628,7 +5716,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (Vec.isUndef()) {
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
@@ -5638,30 +5726,30 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
if (ShiftRight != 0)
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
- DAG.getConstant(ShiftRight, dl, MVT::i8));
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
// isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ DAG.getConstant(0, dl, WideOpVT),
Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
- SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
+ SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
}
@@ -5675,30 +5763,47 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- // Move the current value of the bit to be replace to the lsbs.
- Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- // Xor with the new bit.
- Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
- // Shift to MSB, filling bottom bits with 0.
+
+ // Clear the upper bits of the subvector and move it to its insert position.
unsigned ShiftLeft = NumElems - SubVecNumElems;
- Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
- // Shift to the final position, filling upper bits with 0.
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
- DAG.getConstant(ShiftRight, dl, MVT::i8));
- // Xor with original vector leaving the new value.
- Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+
+ // Isolate the bits below the insertion point.
+ unsigned LowShift = NumElems - IdxVal;
+ SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
+ DAG.getTargetConstant(LowShift, dl, MVT::i8));
+ Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
+ DAG.getTargetConstant(LowShift, dl, MVT::i8));
+
+ // Isolate the bits after the last inserted bit.
+ unsigned HighShift = IdxVal + SubVecNumElems;
+ SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
+ DAG.getTargetConstant(HighShift, dl, MVT::i8));
+ High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
+ DAG.getTargetConstant(HighShift, dl, MVT::i8));
+
+ // Now OR all 3 pieces together.
+ Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
+ SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
+
// Reduce to original width if needed.
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
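Note: a scalar model of the new low/high/subvector OR scheme, using a 64-bit integer as the widened mask register. It assumes 0 < Idx and Idx + SubNumElems < 64, matching the general case that falls through to this code (the earlier special cases handle the aligned positions); the helper name is illustrative.

    #include <cassert>
    #include <cstdint>

    // Insert an 8-element sub-mask at bit Idx of a 64-element mask, keeping
    // the surrounding bits, mirroring the KSHIFTL/KSHIFTR/OR sequence above.
    static uint64_t insertSubMask(uint64_t Vec, uint8_t Sub, unsigned Idx) {
      const unsigned NumElems = 64, SubNumElems = 8;
      assert(Idx > 0 && Idx + SubNumElems < NumElems && "general case only");
      // Clear the upper bits of the subvector and move it to its position.
      uint64_t S = (uint64_t(Sub) << (NumElems - SubNumElems)) >>
                   (NumElems - SubNumElems - Idx);
      // Isolate the bits below the insertion point.
      uint64_t Low = (Vec << (NumElems - Idx)) >> (NumElems - Idx);
      // Isolate the bits above the inserted range.
      uint64_t High = (Vec >> (Idx + SubNumElems)) << (Idx + SubNumElems);
      // OR all three pieces together.
      return S | Low | High;
    }

    int main() {
      assert(insertSubMask(~0ULL, 0x00, 8) == 0xFFFFFFFFFFFF00FFULL);
      assert(insertSubMask(0, 0xA5, 16) == (uint64_t(0xA5) << 16));
      return 0;
    }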
-static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- const SDLoc &dl, unsigned VectorWidth) {
- SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
- return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
+static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
+ EVT SubVT = V1.getValueType();
+ EVT SubSVT = SubVT.getScalarType();
+ unsigned SubNumElts = SubVT.getVectorNumElements();
+ unsigned SubVectorWidth = SubVT.getSizeInBits();
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
+ SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
+ return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
}
/// Returns a vector of specified type with all bits set.
@@ -5755,6 +5860,34 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
return DAG.getNode(Opcode, DL, VT, In);
}
+// Match (xor X, -1) -> X.
+// Match extract_subvector(xor X, -1) -> extract_subvector(X).
+// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
+ V = peekThroughBitcasts(V);
+ if (V.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+ return V.getOperand(0);
+ if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
+ if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
+ Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
+ Not, V.getOperand(1));
+ }
+ }
+ SmallVector<SDValue, 2> CatOps;
+ if (collectConcatOps(V.getNode(), CatOps)) {
+ for (SDValue &CatOp : CatOps) {
+ SDValue NotCat = IsNOT(CatOp, DAG);
+ if (!NotCat) return SDValue();
+ CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
+ }
+ return SDValue();
+}
+
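Note: a scalar analogue of the three matched forms, with plain integers standing in for vectors (illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0x12345678u;
      // xor X, -1 is NOT.
      assert((X ^ 0xFFFFFFFFu) == ~X);
      // NOT commutes with extracting a sub-piece (extract_subvector of a NOT).
      assert(uint16_t(~X) == uint16_t(~uint16_t(X)));
      // NOT commutes with concatenating pieces (concat_vectors of NOTs).
      uint16_t Lo = 0x1234, Hi = 0xABCD;
      uint32_t Cat = (uint32_t(Hi) << 16) | Lo;
      assert(((uint32_t(uint16_t(~Hi)) << 16) | uint16_t(~Lo)) == ~Cat);
      return 0;
    }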
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
@@ -6003,6 +6136,37 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
}
+ if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ EltSizeInBits <= VT.getScalarSizeInBits()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
+ return false;
+
+ SDValue Ptr = MemIntr->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!CNode || CNode->isMachineConstantPoolEntry() ||
+ CNode->getOffset() != 0)
+ return false;
+
+ if (const Constant *C = CNode->getConstVal()) {
+ unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
+ if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
+ if (UndefSrcElts[0])
+ UndefSrcElts.setBits(0, NumSrcElts);
+ SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+ }
+ }
+
// Extract constant bits from a subvector broadcast.
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
SmallVector<APInt, 16> SubEltBits;
@@ -6123,7 +6287,9 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return false;
}
-static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
+namespace llvm {
+namespace X86 {
+bool isConstantSplat(SDValue Op, APInt &SplatVal) {
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
@@ -6146,6 +6312,8 @@ static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
return false;
}
+} // namespace X86
+} // namespace llvm
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
@@ -6551,13 +6719,12 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return true;
}
-/// Check a target shuffle mask's inputs to see if we can set any values to
-/// SM_SentinelZero - this is for elements that are known to be zero
-/// (not just zeroable) from their inputs.
+/// Decode a target shuffle mask and inputs and see if any values are
+/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
-static bool setTargetShuffleZeroElements(SDValue N,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<SDValue> &Ops) {
+static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SDValue> &Ops,
+ APInt &KnownUndef, APInt &KnownZero) {
bool IsUnary;
if (!isTargetShuffle(N.getOpcode()))
return false;
@@ -6566,15 +6733,17 @@ static bool setTargetShuffleZeroElements(SDValue N,
if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
return false;
+ int Size = Mask.size();
SDValue V1 = Ops[0];
SDValue V2 = IsUnary ? V1 : Ops[1];
+ KnownUndef = KnownZero = APInt::getNullValue(Size);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
assert((VT.getSizeInBits() % Mask.size()) == 0 &&
"Illegal split of shuffle value type");
- unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
+ unsigned EltSizeInBits = VT.getSizeInBits() / Size;
// Extract known constant input data.
APInt UndefSrcElts[2];
@@ -6585,12 +6754,18 @@ static bool setTargetShuffleZeroElements(SDValue N,
getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
SrcEltBits[1], true, false)};
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ for (int i = 0; i < Size; ++i) {
int M = Mask[i];
// Already decoded as SM_SentinelZero / SM_SentinelUndef.
- if (M < 0)
+ if (M < 0) {
+ assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
+ if (SM_SentinelUndef == M)
+ KnownUndef.setBit(i);
+ if (SM_SentinelZero == M)
+ KnownZero.setBit(i);
continue;
+ }
// Determine shuffle input and normalize the mask.
unsigned SrcIdx = M / Size;
@@ -6599,7 +6774,7 @@ static bool setTargetShuffleZeroElements(SDValue N,
// We are referencing an UNDEF input.
if (V.isUndef()) {
- Mask[i] = SM_SentinelUndef;
+ KnownUndef.setBit(i);
continue;
}
@@ -6612,31 +6787,64 @@ static bool setTargetShuffleZeroElements(SDValue N,
int Scale = Size / V.getValueType().getVectorNumElements();
int Idx = M / Scale;
if (Idx != 0 && !VT.isFloatingPoint())
- Mask[i] = SM_SentinelUndef;
+ KnownUndef.setBit(i);
else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
- Mask[i] = SM_SentinelZero;
+ KnownZero.setBit(i);
continue;
}
// Attempt to extract from the source's constant bits.
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
- Mask[i] = SM_SentinelUndef;
+ KnownUndef.setBit(i);
else if (SrcEltBits[SrcIdx][M] == 0)
- Mask[i] = SM_SentinelZero;
+ KnownZero.setBit(i);
}
}
- assert(VT.getVectorNumElements() == Mask.size() &&
+ assert(VT.getVectorNumElements() == (unsigned)Size &&
"Different mask size from vector size!");
return true;
}
+// Replace target shuffle mask elements with known undef/zero sentinels.
+static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
+ const APInt &KnownUndef,
+ const APInt &KnownZero) {
+ unsigned NumElts = Mask.size();
+ assert(KnownUndef.getBitWidth() == NumElts &&
+ KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (KnownUndef[i])
+ Mask[i] = SM_SentinelUndef;
+ else if (KnownZero[i])
+ Mask[i] = SM_SentinelZero;
+ }
+}
+
+// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
+static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
+ APInt &KnownUndef,
+ APInt &KnownZero) {
+ unsigned NumElts = Mask.size();
+ KnownUndef = KnownZero = APInt::getNullValue(NumElts);
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (SM_SentinelUndef == M)
+ KnownUndef.setBit(i);
+ if (SM_SentinelZero == M)
+ KnownZero.setBit(i);
+ }
+}
+
// Forward declaration (for getFauxShuffleMask recursive check).
-static bool resolveTargetShuffleInputs(SDValue Op,
- SmallVectorImpl<SDValue> &Inputs,
- SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG);
+// TODO: Use DemandedElts variant.
+static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ SelectionDAG &DAG, unsigned Depth,
+ bool ResolveKnownElts);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
@@ -6644,7 +6852,8 @@ static bool resolveTargetShuffleInputs(SDValue Op,
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG, unsigned Depth,
+ bool ResolveKnownElts) {
Mask.clear();
Ops.clear();
@@ -6685,7 +6894,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
Mask.push_back(SM_SentinelUndef);
continue;
}
- uint64_t ByteBits = EltBits[i].getZExtValue();
+ const APInt &ByteBits = EltBits[i];
if (ByteBits != 0 && ByteBits != 255)
return false;
Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
@@ -6696,8 +6905,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
case ISD::OR: {
// Inspect each operand at the byte level. We can merge these into a
// blend shuffle mask if for each byte at least one is masked out (zero).
- KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts);
- KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts);
+ KnownBits Known0 =
+ DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
+ KnownBits Known1 =
+ DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
unsigned NumSizeInBytes = NumSizeInBits / 8;
@@ -6736,14 +6947,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
- if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
- !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
+ if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
+ true) ||
+ !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
+ true))
return false;
- int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
+ size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
- for (int i = 0; i != MaskSize; ++i) {
+ for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
@@ -6751,14 +6964,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
else if (Mask1[i] == SM_SentinelZero)
Mask.push_back(Mask0[i]);
else if (Mask0[i] == SM_SentinelZero)
- Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size()));
+ Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
else
return false;
}
- for (SDValue &Op : SrcInputs0)
- Ops.push_back(Op);
- for (SDValue &Op : SrcInputs1)
- Ops.push_back(Op);
+ Ops.append(SrcInputs0.begin(), SrcInputs0.end());
+ Ops.append(SrcInputs1.begin(), SrcInputs1.end());
return true;
}
case ISD::INSERT_SUBVECTOR: {
@@ -6786,8 +6997,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
- if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
- SubMask, DAG))
+ if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
+ SubMask, DAG, Depth + 1, ResolveKnownElts))
return false;
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
@@ -6911,14 +7122,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
if ((!N0.isUndef() &&
- DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) ||
+ DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
(!N1.isUndef() &&
- DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt))
+ DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
- if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) ||
- (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS)))
+ if ((!N0.isUndef() &&
+ !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
+ (!N1.isUndef() &&
+ !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
return false;
}
@@ -7061,23 +7274,45 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
Inputs = UsedInputs;
}
-/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
-/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
-/// remaining input indices in case we now have a unary shuffle and adjust the
-/// inputs accordingly.
+/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
+/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
/// Returns true if the target shuffle mask was decoded.
-static bool resolveTargetShuffleInputs(SDValue Op,
- SmallVectorImpl<SDValue> &Inputs,
- SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG) {
+static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
+ SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ APInt &KnownUndef, APInt &KnownZero,
+ SelectionDAG &DAG, unsigned Depth,
+ bool ResolveKnownElts) {
+ EVT VT = Op.getValueType();
+ if (!VT.isSimple() || !VT.isVector())
+ return false;
+
+ if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
+ if (ResolveKnownElts)
+ resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
+ return true;
+ }
+ if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
+ ResolveKnownElts)) {
+ resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
+ return true;
+ }
+ return false;
+}
+
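+// Convenience wrapper that demands all elements and resolves the known
+// undef/zero bits locally.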
+static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ SelectionDAG &DAG, unsigned Depth = 0,
+ bool ResolveKnownElts = true) {
+ EVT VT = Op.getValueType();
+ if (!VT.isSimple() || !VT.isVector())
+ return false;
+
+ APInt KnownUndef, KnownZero;
unsigned NumElts = Op.getValueType().getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
- if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
- if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG))
- return false;
-
- resolveTargetShuffleInputsAndMask(Inputs, Mask);
- return true;
+ return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
+ KnownZero, DAG, Depth, ResolveKnownElts);
}
/// Returns the scalar element that will make up the ith
@@ -7414,7 +7649,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
SDLoc DL(Op);
SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getIntPtrConstant(InsertPSMask, DL));
+ DAG.getIntPtrConstant(InsertPSMask, DL, true));
return DAG.getBitcast(VT, Result);
}
@@ -7427,7 +7662,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
- SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
+ SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
@@ -7439,7 +7674,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
// the shuffle mask.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
SDValue Ptr = LD->getBasePtr();
- if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+ if (!ISD::isNormalLoad(LD) || !LD->isSimple())
return SDValue();
EVT PVT = LD->getValueType(0);
if (PVT != MVT::i32 && PVT != MVT::f32)
@@ -7504,6 +7739,49 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
return SDValue();
}
+// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
+static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
+ if (ISD::isNON_EXTLoad(Elt.getNode())) {
+ auto *BaseLd = cast<LoadSDNode>(Elt);
+ if (!BaseLd->isSimple())
+ return false;
+ Ld = BaseLd;
+ ByteOffset = 0;
+ return true;
+ }
+
+ switch (Elt.getOpcode()) {
+ case ISD::BITCAST:
+ case ISD::TRUNCATE:
+ case ISD::SCALAR_TO_VECTOR:
+ return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
+ case ISD::SRL:
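+ // A shift right by a whole number of bytes accumulates into the byte offset.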
+ if (isa<ConstantSDNode>(Elt.getOperand(1))) {
+ uint64_t Idx = Elt.getConstantOperandVal(1);
+ if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
+ ByteOffset += Idx / 8;
+ return true;
+ }
+ }
+ break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (isa<ConstantSDNode>(Elt.getOperand(1))) {
+ SDValue Src = Elt.getOperand(0);
+ unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
+ unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
+ if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
+ findEltLoadSrc(Src, Ld, ByteOffset)) {
+ uint64_t Idx = Elt.getConstantOperandVal(1);
+ ByteOffset += Idx * (SrcSizeInBits / 8);
+ return true;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
@@ -7513,6 +7791,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
+ if ((VT.getScalarSizeInBits() % 8) != 0)
+ return SDValue();
+
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
@@ -7521,6 +7802,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
APInt UndefMask = APInt::getNullValue(NumElems);
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+ SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
@@ -7539,13 +7821,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Each loaded element must be the correct fractional portion of the
// requested vector load.
- if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+ unsigned EltSizeInBits = Elt.getValueSizeInBits();
+ if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
return SDValue();
- if (!ISD::isNON_EXTLoad(Elt.getNode()))
+ if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
+ return SDValue();
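+ // The referenced bits must lie entirely within the source load.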
+ unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
+ if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
return SDValue();
- Loads[i] = cast<LoadSDNode>(Elt);
LoadMask.setBit(i);
LastLoadedElt = i;
}
@@ -7575,6 +7860,24 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
+ // TODO: Support offsetting the base load.
+ if (ByteOffsets[FirstLoadedElt] != 0)
+ return SDValue();
+
+ // Check to see if the element's load is consecutive to the base load
+ // or offset from a previous (already checked) load.
+ auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
+ LoadSDNode *Ld = Loads[EltIdx];
+ int64_t ByteOffset = ByteOffsets[EltIdx];
+ if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
+ int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
+ return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
+ Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
+ }
+ return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
+ EltIdx - FirstLoadedElt);
+ };
+
// Consecutive loads can contain UNDEFS but not ZERO elements.
// Consecutive loads with UNDEFs and ZERO elements require
// an additional shuffle stage to clear the ZERO elements.
@@ -7582,8 +7885,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
- if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
- i - FirstLoadedElt)) {
+ if (!CheckConsecutiveLoad(LDBase, i)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
@@ -7595,8 +7897,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
- assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
- "Cannot merge volatile loads.");
+ assert(LDBase->isSimple() &&
+ "Cannot merge volatile or atomic loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
@@ -7636,17 +7938,22 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
if (!isAfterLegalize && VT.isVector()) {
- SmallVector<int, 4> ClearMask(NumElems, -1);
- for (unsigned i = 0; i < NumElems; ++i) {
- if (ZeroMask[i])
- ClearMask[i] = i + NumElems;
- else if (LoadMask[i])
- ClearMask[i] = i;
+ unsigned NumMaskElts = VT.getVectorNumElements();
+ if ((NumMaskElts % NumElems) == 0) {
+ unsigned Scale = NumMaskElts / NumElems;
+ SmallVector<int, 4> ClearMask(NumMaskElts, -1);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (UndefMask[i])
+ continue;
+ int Offset = ZeroMask[i] ? NumMaskElts : 0;
+ for (unsigned j = 0; j != Scale; ++j)
+ ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
+ }
+ SDValue V = CreateLoad(VT, LDBase);
+ SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
- SDValue V = CreateLoad(VT, LDBase);
- SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
- : DAG.getConstantFP(0.0, DL, VT);
- return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
}
@@ -8194,34 +8501,10 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
- if (ISD::isBuildVectorAllZeros(Op.getNode()))
- return Op;
-
- if (ISD::isBuildVectorAllOnes(Op.getNode()))
+ if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
+ ISD::isBuildVectorAllOnes(Op.getNode()))
return Op;
- if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
- // Split the pieces.
- SDValue Lower =
- DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
- SDValue Upper =
- DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
- // We have to manually lower both halves so getNode doesn't try to
- // reassemble the build_vector.
- Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
- Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
- }
- SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
- if (Imm.getValueSizeInBits() == VT.getSizeInBits())
- return DAG.getBitcast(VT, Imm);
- SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
- DAG.getIntPtrConstant(0, dl));
- }
-
- // Vector has one or more non-const elements
uint64_t Immediate = 0;
SmallVector<unsigned, 16> NonConstIdx;
bool IsSplat = true;
@@ -8244,29 +8527,40 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
}
// For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
- if (IsSplat)
- return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
+ if (IsSplat) {
+ // The build_vector allows the scalar element to be larger than the vector
+ // element type. We need to mask it to use as a condition unless we know
+ // the upper bits are zero.
+ // FIXME: Use computeKnownBits instead of checking specific opcode?
+ SDValue Cond = Op.getOperand(SplatIdx);
+ assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
+ if (Cond.getOpcode() != ISD::SETCC)
+ Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
+ DAG.getConstant(1, dl, MVT::i8));
+ return DAG.getSelect(dl, VT, Cond,
DAG.getConstant(1, dl, VT),
DAG.getConstant(0, dl, VT));
+ }
// insert elements one by one
SDValue DstVec;
- SDValue Imm;
- if (Immediate) {
- MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
- Imm = DAG.getConstant(Immediate, dl, ImmVT);
- }
- else if (HasConstElts)
- Imm = DAG.getConstant(0, dl, VT);
- else
- Imm = DAG.getUNDEF(VT);
- if (Imm.getValueSizeInBits() == VT.getSizeInBits())
- DstVec = DAG.getBitcast(VT, Imm);
- else {
- SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
- DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
- DAG.getIntPtrConstant(0, dl));
- }
+ if (HasConstElts) {
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
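+ // Split the 64-bit immediate into two halves that can be bitcast to v32i1.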
+ SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
+ SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
+ ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
+ ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
+ DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
+ } else {
+ MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U));
+ SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
+ MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
+ DstVec = DAG.getBitcast(VecVT, Imm);
+ DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ } else
+ DstVec = DAG.getUNDEF(VT);
for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
unsigned InsertIdx = NonConstIdx[i];
@@ -8757,7 +9051,7 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
// If we don't need the upper xmm, then perform as a xmm hop.
unsigned HalfNumElts = NumElts / 2;
if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
- MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts);
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
@@ -8965,21 +9259,14 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
MVT VT = Op.getSimpleValueType();
// Vectors containing all zeros can be matched by pxor and xorps.
- if (ISD::isBuildVectorAllZeros(Op.getNode())) {
- // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
- // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
- if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
- return Op;
-
- return getZeroVector(VT, Subtarget, DAG, DL);
- }
+ if (ISD::isBuildVectorAllZeros(Op.getNode()))
+ return Op;
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
- if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
- (VT == MVT::v8i32 && Subtarget.hasInt256()))
+ if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
return getOnesVector(VT, DAG, DL);
@@ -9150,9 +9437,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{4, 5, 6, 7, 4, 5, 6, 7});
if (Subtarget.hasXOP())
- return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
- LoLo, HiHi, IndicesVec,
- DAG.getConstant(0, DL, MVT::i8)));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
+ IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPS only uses index bits[0:1] to permute elements.
SDValue Res = DAG.getSelectCC(
@@ -9186,9 +9473,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
// VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
if (Subtarget.hasXOP())
- return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
- LoLo, HiHi, IndicesVec,
- DAG.getConstant(0, DL, MVT::i8)));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
+ IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPD only uses index bit[1] to permute elements.
SDValue Res = DAG.getSelectCC(
@@ -9283,7 +9570,7 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
return SDValue();
auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
- if (!PermIdx || PermIdx->getZExtValue() != Idx)
+ if (!PermIdx || PermIdx->getAPIntValue() != Idx)
return SDValue();
}
@@ -9434,23 +9721,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- if (VT.getSizeInBits() >= 256) {
- MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
- if (Subtarget.hasAVX()) {
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
- Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
- } else {
- // Without AVX, we need to extend to a 128-bit vector and then
- // insert into the 256-bit vector.
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
- SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
- Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
- }
- } else {
- assert(VT.is128BitVector() && "Expected an SSE value type!");
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
- Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
- }
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, Item);
}
}
@@ -9549,8 +9822,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
- return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
- VT.getSizeInBits() / 2);
+ return concatSubVectors(Lower, Upper, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
@@ -9703,8 +9975,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
// If we have more than 2 non-zeros, build each half separately.
if (NumNonZero > 2) {
- MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
- ResVT.getVectorNumElements()/2);
+ MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
@@ -9745,30 +10016,47 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
- unsigned NumZero = 0;
- unsigned NumNonZero = 0;
+ uint64_t Zeros = 0;
uint64_t NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
continue;
+ assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- ++NumZero;
- else {
- assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
+ Zeros |= (uint64_t)1 << i;
+ else
NonZeros |= (uint64_t)1 << i;
- ++NumNonZero;
- }
}
+ unsigned NumElems = ResVT.getVectorNumElements();
+
+ // If we are inserting a non-zero vector and there are zeros in the LSBs and
+ // undefs in the MSBs, we need to emit a KSHIFTL. The generic lowering to
+ // insert_subvector will give us two kshifts.
+ if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
+ Log2_64(NonZeros) != NumOperands - 1) {
+ MVT ShiftVT = ResVT;
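+ // KSHIFT needs at least a v16i1 type (v8i1 with DQI), so widen the shift
+ // type if the result type is narrower than that.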
+ if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
+ ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ unsigned Idx = Log2_64(NonZeros);
+ SDValue SubVec = Op.getOperand(Idx);
+ unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
+ DAG.getUNDEF(ShiftVT), SubVec,
+ DAG.getIntPtrConstant(0, dl));
+ Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
+ DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
+ DAG.getIntPtrConstant(0, dl));
+ }
// If there are zero or one non-zeros we can handle this very simply.
- if (NumNonZero <= 1) {
- SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
- : DAG.getUNDEF(ResVT);
- if (!NumNonZero)
+ if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
+ SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
+ if (!NonZeros)
return Vec;
- unsigned Idx = countTrailingZeros(NonZeros);
+ unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
@@ -9776,8 +10064,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
}
if (NumOperands > 2) {
- MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
- ResVT.getVectorNumElements()/2);
+ MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
@@ -9786,7 +10073,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
- assert(NumNonZero == 2 && "Simple cases not handled?");
+ assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
if (ResVT.getVectorNumElements() >= 16)
return Op; // The operation is legal with KUNPCK
@@ -9794,7 +10081,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
DAG.getUNDEF(ResVT), Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
- unsigned NumElems = ResVT.getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
DAG.getIntPtrConstant(NumElems/2, dl));
}
@@ -9997,42 +10283,44 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
-/// SM_SentinelZero is accepted as a valid negative index but must match in both.
+/// SM_SentinelZero is accepted as a valid negative index but must match in
+/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
- ArrayRef<int> ExpectedMask) {
+ ArrayRef<int> ExpectedMask,
+ SDValue V1 = SDValue(),
+ SDValue V2 = SDValue()) {
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
"Illegal target shuffle mask");
- for (int i = 0; i < Size; ++i)
- if (Mask[i] == SM_SentinelUndef)
- continue;
- else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
- return false;
- else if (Mask[i] != ExpectedMask[i])
- return false;
-
- return true;
-}
+ // Check for out-of-range target shuffle mask indices.
+ if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
+ return false;
-// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
-// mask.
-static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
- const APInt &Zeroable) {
- int NumElts = Mask.size();
- assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
+ // If the values are build vectors, we can look through them to find
+ // equivalent inputs that make the shuffles equivalent.
+ auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
+ auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
+ BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
+ BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);
- SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
- for (int i = 0; i != NumElts; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
continue;
- assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
- TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
+ if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
+ auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+ auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+ if (MaskBV && ExpectedBV &&
+ MaskBV->getOperand(Mask[i] % Size) ==
+ ExpectedBV->getOperand(ExpectedMask[i] % Size))
+ continue;
+ }
+ // TODO - handle SM_Sentinel equivalences.
+ return false;
}
- return TargetMask;
+ return true;
}
// Attempt to create a shuffle mask from a VSELECT condition mask.
@@ -10133,7 +10421,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
- return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
+ return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// Compute whether each element of a shuffle is zeroable.
@@ -10573,14 +10861,14 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
// Try binary shuffle.
SmallVector<int, 32> BinaryMask;
createPackShuffleMask(VT, BinaryMask, false);
- if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
+ if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
if (MatchPACK(V1, V2))
return true;
// Try unary shuffle.
SmallVector<int, 32> UnaryMask;
createPackShuffleMask(VT, UnaryMask, true);
- if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
+ if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
if (MatchPACK(V1, V1))
return true;
@@ -10685,9 +10973,9 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SelectionDAG &DAG);
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
- MutableArrayRef<int> TargetMask,
- bool &ForceV1Zero, bool &ForceV2Zero,
- uint64_t &BlendMask) {
+ MutableArrayRef<int> Mask,
+ const APInt &Zeroable, bool &ForceV1Zero,
+ bool &ForceV2Zero, uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
@@ -10695,13 +10983,12 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
BlendMask = 0;
ForceV1Zero = false, ForceV2Zero = false;
- assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
+ assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
- // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
- for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
- int M = TargetMask[i];
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
if (M == i)
@@ -10710,16 +10997,16 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
BlendMask |= 1ull << i;
continue;
}
- if (M == SM_SentinelZero) {
+ if (Zeroable[i]) {
if (V1IsZeroOrUndef) {
ForceV1Zero = true;
- TargetMask[i] = i;
+ Mask[i] = i;
continue;
}
if (V2IsZeroOrUndef) {
ForceV2Zero = true;
BlendMask |= 1ull << i;
- TargetMask[i] = i + Size;
+ Mask[i] = i + Size;
continue;
}
}
@@ -10748,11 +11035,10 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
-
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
- if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
+ SmallVector<int, 64> Mask(Original.begin(), Original.end());
+ if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
BlendMask))
return SDValue();
@@ -10778,7 +11064,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v8i16:
assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8));
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
@@ -10790,7 +11076,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (RepeatedMask[i] >= 8)
BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8));
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8));
}
// Use PBLENDW for lower/upper lanes and then blend lanes.
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to
@@ -10799,9 +11085,9 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
uint64_t HiMask = (BlendMask >> 8) & 0xFF;
if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
- DAG.getConstant(LoMask, DL, MVT::i8));
+ DAG.getTargetConstant(LoMask, DL, MVT::i8));
SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
- DAG.getConstant(HiMask, DL, MVT::i8));
+ DAG.getTargetConstant(HiMask, DL, MVT::i8));
return DAG.getVectorShuffle(
MVT::v16i16, DL, Lo, Hi,
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
@@ -11061,7 +11347,7 @@ static SDValue lowerShuffleAsByteRotateAndPermute(
SDValue Rotate = DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
DAG.getBitcast(ByteVT, Lo),
- DAG.getConstant(Scale * RotAmt, DL, MVT::i8)));
+ DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
@@ -11268,7 +11554,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
"512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
- DAG.getConstant(ByteRotation, DL, MVT::i8)));
+ DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
}
assert(VT.is128BitVector() &&
@@ -11282,10 +11568,12 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
int LoByteShift = 16 - ByteRotation;
int HiByteShift = ByteRotation;
- SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
- DAG.getConstant(LoByteShift, DL, MVT::i8));
- SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
- DAG.getConstant(HiByteShift, DL, MVT::i8));
+ SDValue LoShift =
+ DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
+ DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
+ SDValue HiShift =
+ DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
+ DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
return DAG.getBitcast(VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
@@ -11317,7 +11605,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
return SDValue();
return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
- DAG.getConstant(Rotation, DL, MVT::i8));
+ DAG.getTargetConstant(Rotation, DL, MVT::i8));
}
/// Try to lower a vector shuffle as a byte shift sequence.
@@ -11356,27 +11644,27 @@ static SDValue lowerVectorShuffleAsByteShiftMask(
if (ZeroLo == 0) {
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * ZeroHi, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
} else if (ZeroHi == 0) {
unsigned Shift = Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else if (!Subtarget.hasSSSE3()) {
// If we don't have PSHUFB then it's worth avoiding an AND constant mask
// by performing 3 byte shifts. Shuffle combining can kick in above that.
// TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Shift += Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else
return SDValue();
@@ -11498,7 +11786,7 @@ static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
"Illegal integer vector type");
V = DAG.getBitcast(ShiftVT, V);
V = DAG.getNode(Opcode, DL, ShiftVT, V,
- DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getBitcast(VT, V);
}
@@ -11632,14 +11920,14 @@ static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
V2 ? V2 : DAG.getUNDEF(VT),
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return SDValue();
}
@@ -11686,9 +11974,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
};
- // Found a valid zext mask! Try various lowering strategies based on the
+ // Found a valid a/zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
- // TODO: Add AnyExt support.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
@@ -11697,7 +11984,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
- InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG);
+ InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
+ ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -11736,8 +12024,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(LoIdx, DL, MVT::i8)));
+ DAG.getTargetConstant(EltBits, DL, MVT::i8),
+ DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
@@ -11745,8 +12033,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
int HiIdx = (Offset + 1) * EltBits;
SDValue Hi = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(HiIdx, DL, MVT::i8)));
+ DAG.getTargetConstant(EltBits, DL, MVT::i8),
+ DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
@@ -11759,8 +12047,12 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i) {
int Idx = Offset + (i / Scale);
- PSHUFBMask[i] = DAG.getConstant(
- (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
+ if ((i % Scale == 0 && SafeOffset(Idx))) {
+ PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
+ continue;
+ }
+ PSHUFBMask[i] =
+ AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
}
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(
@@ -12052,9 +12344,9 @@ static SDValue lowerShuffleAsElementInsertion(
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
V2 = DAG.getBitcast(MVT::v16i8, V2);
- V2 = DAG.getNode(
- X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
- DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
+ V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
+ DAG.getTargetConstant(
+ V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
@@ -12294,7 +12586,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
- } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
+ } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
@@ -12486,7 +12778,7 @@ static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
// Insert the V2 element into the desired position.
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
/// Try to lower a shuffle as a permute of the inputs followed by an
@@ -12635,14 +12927,14 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
- DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
return DAG.getNode(
X86ISD::SHUFP, DL, MVT::v2f64,
Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
- DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
@@ -12688,7 +12980,7 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
- DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
/// Handle lowering of 2-lane 64-bit integer shuffles.
@@ -12996,10 +13288,12 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
- // Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG))
- return Broadcast;
+ // Try to use broadcast unless the mask only has one non-undef element.
+ if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+ }
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
@@ -13680,16 +13974,16 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
- // Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
- Mask, Subtarget, DAG))
- return Broadcast;
-
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
Zeroable, Subtarget, DAG))
return Shift;
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
@@ -13984,8 +14278,16 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
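+ // Track whether any even/odd destination bytes are referenced so that an
+ // unused unpack operand can be left undef.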
+ bool EvenInUse = false, OddInUse = false;
+ for (int i = 0; i < 16; i += 2) {
+ EvenInUse |= (Mask[i + 0] >= 0);
+ OddInUse |= (Mask[i + 1] >= 0);
+ if (EvenInUse && OddInUse)
+ break;
+ }
V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
- MVT::v16i8, V1, V1);
+ MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
+ OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
@@ -14100,11 +14402,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
- // We use the mask type to pick which bytes are preserved based on how many
- // elements are dropped.
- MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
- SDValue ByteClearMask = DAG.getBitcast(
- MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
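+ // Build a byte mask that keeps the first byte of each group of
+ // 1 << NumEvenDrops bytes and zeroes out the rest.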
+ SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8));
+ for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops)
+ ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8);
+ SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps);
V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
if (!IsSingleInput)
V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
@@ -14448,16 +14749,14 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
}
-/// Lower a vector shuffle crossing multiple 128-bit lanes as
-/// a permutation and blend of those lanes.
+/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
+/// source with a lane permutation.
///
-/// This essentially blends the out-of-lane inputs to each lane into the lane
-/// from a permuted copy of the vector. This lowering strategy results in four
-/// instructions in the worst case for a single-input cross lane shuffle which
-/// is lower than any other fully general cross-lane shuffle strategy I'm aware
-/// of. Special cases for each particular shuffle pattern should be handled
-/// prior to trying this lowering.
-static SDValue lowerShuffleAsLanePermuteAndBlend(
+/// This lowering strategy results in four instructions in the worst case for a
+/// single-input cross lane shuffle which is lower than any other fully general
+/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
+/// shuffle pattern should be handled prior to trying this lowering.
+static SDValue lowerShuffleAsLanePermuteAndShuffle(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
@@ -14484,24 +14783,28 @@ static SDValue lowerShuffleAsLanePermuteAndBlend(
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
+ // TODO - we could support shuffling V2 in the Flipped input.
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
- SmallVector<int, 32> FlippedBlendMask(Size);
- for (int i = 0; i < Size; ++i)
- FlippedBlendMask[i] =
- Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
- ? Mask[i]
- : Mask[i] % LaneSize +
- (i / LaneSize) * LaneSize + Size);
+ SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
+ for (int i = 0; i < Size; ++i) {
+ int &M = InLaneMask[i];
+ if (M < 0)
+ continue;
+ if (((M % Size) / LaneSize) != (i / LaneSize))
+ M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+ }
+ assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
+ "In-lane shuffle mask expected");
- // Flip the vector, and blend the results which should now be in-lane.
+ // Flip the lanes, and shuffle the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
SDValue Flipped = DAG.getBitcast(PVT, V1);
- Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
- { 2, 3, 0, 1 });
+ Flipped =
+ DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
Flipped = DAG.getBitcast(VT, Flipped);
- return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
+ return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
}
/// Handle lowering 2-lane 128-bit shuffles.
@@ -14565,8 +14868,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
((WidenedMask[1] % 2) << 1);
- return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
- DAG.getConstant(PermMask, DL, MVT::i8));
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
}
}
@@ -14598,7 +14901,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
V2 = DAG.getUNDEF(VT);
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
- DAG.getConstant(PermMask, DL, MVT::i8));
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
@@ -14616,26 +14919,26 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
if (is128BitLaneRepeatedShuffleMask(VT, Mask))
return SDValue();
- int Size = Mask.size();
+ int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
- int LaneSize = 128 / VT.getScalarSizeInBits();
- SmallVector<int, 16> RepeatMask(LaneSize, -1);
+ int NumLaneElts = 128 / VT.getScalarSizeInBits();
+ SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
- int Srcs[2] = { -1, -1 };
- SmallVector<int, 16> InLaneMask(LaneSize, -1);
- for (int i = 0; i != LaneSize; ++i) {
- int M = Mask[(Lane * LaneSize) + i];
+ int Srcs[2] = {-1, -1};
+ SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
+ for (int i = 0; i != NumLaneElts; ++i) {
+ int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
continue;
// Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
// lane. We can assign up to 2 sources for this lane. If we run out of
// sources we can't do anything.
- int LaneSrc = M / LaneSize;
+ int LaneSrc = M / NumLaneElts;
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
Src = 0;
@@ -14645,7 +14948,7 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
return SDValue();
Srcs[Src] = LaneSrc;
- InLaneMask[i] = (M % LaneSize) + Src * Size;
+ InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
}
// If this lane has two sources, see if it fits with the repeat mask so far.
@@ -14701,23 +15004,23 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
if (LaneSrcs[Lane][0] >= 0)
continue;
- for (int i = 0; i != LaneSize; ++i) {
- int M = Mask[(Lane * LaneSize) + i];
+ for (int i = 0; i != NumLaneElts; ++i) {
+ int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
continue;
// If RepeatMask isn't defined yet we can define it ourself.
if (RepeatMask[i] < 0)
- RepeatMask[i] = M % LaneSize;
+ RepeatMask[i] = M % NumLaneElts;
- if (RepeatMask[i] < Size) {
- if (RepeatMask[i] != M % LaneSize)
+ if (RepeatMask[i] < NumElts) {
+ if (RepeatMask[i] != M % NumLaneElts)
return SDValue();
- LaneSrcs[Lane][0] = M / LaneSize;
+ LaneSrcs[Lane][0] = M / NumLaneElts;
} else {
- if (RepeatMask[i] != ((M % LaneSize) + Size))
+ if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
return SDValue();
- LaneSrcs[Lane][1] = M / LaneSize;
+ LaneSrcs[Lane][1] = M / NumLaneElts;
}
}
@@ -14725,14 +15028,14 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
return SDValue();
}
- SmallVector<int, 16> NewMask(Size, -1);
+ SmallVector<int, 16> NewMask(NumElts, -1);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][0];
- for (int i = 0; i != LaneSize; ++i) {
+ for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
- M = Src * LaneSize + i;
- NewMask[Lane * LaneSize + i] = M;
+ M = Src * NumLaneElts + i;
+ NewMask[Lane * NumLaneElts + i] = M;
}
}
SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
@@ -14745,11 +15048,11 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][1];
- for (int i = 0; i != LaneSize; ++i) {
+ for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
- M = Src * LaneSize + i;
- NewMask[Lane * LaneSize + i] = M;
+ M = Src * NumLaneElts + i;
+ NewMask[Lane * NumLaneElts + i] = M;
}
}
SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
@@ -14760,12 +15063,12 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
return SDValue();
- for (int i = 0; i != Size; ++i) {
- NewMask[i] = RepeatMask[i % LaneSize];
+ for (int i = 0; i != NumElts; ++i) {
+ NewMask[i] = RepeatMask[i % NumLaneElts];
if (NewMask[i] < 0)
continue;
- NewMask[i] += (i / LaneSize) * LaneSize;
+ NewMask[i] += (i / NumLaneElts) * NumLaneElts;
}
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
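// A minimal standalone sketch (not part of the patch) of the final step
// above: broadcasting a per-128-bit-lane repeat mask across the whole vector
// by rebasing each element into its lane. Single-source indices are shown for
// simplicity; the real code also carries second-source indices offset by
// NumElts.
#include <vector>

static std::vector<int> expandRepeatMask(const std::vector<int> &RepeatMask,
                                         int NumLanes) {
  int NumLaneElts = (int)RepeatMask.size();
  std::vector<int> Full(NumLanes * NumLaneElts, -1);
  for (int i = 0, e = NumLanes * NumLaneElts; i != e; ++i) {
    int M = RepeatMask[i % NumLaneElts];
    if (M < 0)
      continue;                                    // keep undef elements
    Full[i] = M + (i / NumLaneElts) * NumLaneElts; // rebase into lane i/NumLaneElts
  }
  return Full;
}
// e.g. RepeatMask {1, 0} over 2 lanes expands to {1, 0, 3, 2}.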
@@ -14831,14 +15134,13 @@ getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> HalfMask, int HalfIdx1,
int HalfIdx2, bool UndefLower,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG, bool UseConcat = false) {
assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
assert(V1.getValueType().isSimple() && "Expecting only simple types");
MVT VT = V1.getSimpleValueType();
- unsigned NumElts = VT.getVectorNumElements();
- unsigned HalfNumElts = NumElts / 2;
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ unsigned HalfNumElts = HalfVT.getVectorNumElements();
auto getHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
@@ -14853,6 +15155,14 @@ static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
SDValue Half1 = getHalfVector(HalfIdx1);
SDValue Half2 = getHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ if (UseConcat) {
+ SDValue Op0 = V;
+ SDValue Op1 = DAG.getUNDEF(HalfVT);
+ if (UndefLower)
+ std::swap(Op0, Op1);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
+ }
+
unsigned Offset = UndefLower ? HalfNumElts : 0;
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
@@ -14877,9 +15187,8 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
// Upper half is undef and lower half is whole upper subvector.
// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- unsigned NumElts = VT.getVectorNumElements();
- unsigned HalfNumElts = NumElts / 2;
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ unsigned HalfNumElts = HalfVT.getVectorNumElements();
if (!UndefLower &&
isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
@@ -15155,11 +15464,19 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
}
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
- unsigned &ShuffleImm, ArrayRef<int> Mask) {
+ bool &ForceV1Zero, bool &ForceV2Zero,
+ unsigned &ShuffleImm, ArrayRef<int> Mask,
+ const APInt &Zeroable) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
+ assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
+ "Illegal shuffle mask");
+
+ bool ZeroLane[2] = { true, true };
+ for (int i = 0; i < NumElts; ++i)
+ ZeroLane[i & 1] &= Zeroable[i];
// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
// Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
@@ -15167,7 +15484,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
bool ShufpdMask = true;
bool CommutableMask = true;
for (int i = 0; i < NumElts; ++i) {
- if (Mask[i] == SM_SentinelUndef)
+ if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
continue;
if (Mask[i] < 0)
return false;
@@ -15180,30 +15497,77 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
ShuffleImm |= (Mask[i] % 2) << i;
}
- if (ShufpdMask)
- return true;
- if (CommutableMask) {
+ if (!ShufpdMask && !CommutableMask)
+ return false;
+
+ if (!ShufpdMask && CommutableMask)
std::swap(V1, V2);
- return true;
- }
- return false;
+ ForceV1Zero = ZeroLane[0];
+ ForceV2Zero = ZeroLane[1];
+ return true;
}
-static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
- assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
+static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
"Unexpected data type for VSHUFPD");
unsigned Immediate = 0;
- if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
+ Mask, Zeroable))
return SDValue();
+ // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+ if (ForceV1Zero)
+ V1 = getZeroVector(VT, Subtarget, DAG, DL);
+ if (ForceV2Zero)
+ V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
- DAG.getConstant(Immediate, DL, MVT::i8));
+ DAG.getTargetConstant(Immediate, DL, MVT::i8));
}
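// A minimal standalone sketch (not part of the patch) of how the SHUFPD
// immediate is formed once the mask has matched the 0/1, 4/5, 2/3, 6/7
// pattern checked above: bit i of the immediate selects the low or high
// double of the 128-bit lane feeding result element i.
#include <cstdio>

static unsigned shufpdImmForV4F64(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    if (Mask[i] >= 0)
      Imm |= (unsigned)(Mask[i] % 2) << i;
  return Imm;
}

int main() {
  int Mask[4] = {1, 4, 3, 6}; // V1[1], V2[0], V1[3], V2[2]
  std::printf("imm = 0x%x\n", shufpdImmForV4F64(Mask)); // prints imm = 0x5
  return 0;
}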
+// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
+// by zeroable elements in the remaining 24 elements. Turn this into two
+// vmovqb instructions shuffled together.
+static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SelectionDAG &DAG) {
+ assert(VT == MVT::v32i8 && "Unexpected type!");
+
+ // The first 8 indices should be every 8th element.
+ if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
+ return SDValue();
+
+ // Remaining elements need to be zeroable.
+ if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
+ return SDValue();
+
+ V1 = DAG.getBitcast(MVT::v4i64, V1);
+ V2 = DAG.getBitcast(MVT::v4i64, V2);
+
+ V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
+ V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
+
+ // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
+ // the upper bits of the result using an unpckldq.
+ SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
+ { 0, 1, 2, 3, 16, 17, 18, 19,
+ 4, 5, 6, 7, 20, 21, 22, 23 });
+ // Insert the unpckldq into a zero vector to widen to v32i8.
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
+ DAG.getConstant(0, DL, MVT::v32i8), Unpack,
+ DAG.getIntPtrConstant(0, DL));
+}
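// A scalar model (not part of the patch) of the data movement performed by
// lowerShuffleAsVTRUNCAndUnpack above, assuming the mask really is
// {0, 8, ..., 56} followed by 24 zeroable elements: each VTRUNC keeps byte 0
// of every 64-bit element, and the unpckldq interleaves the two 4-byte
// results into the low 8 bytes of an otherwise zero vector.
#include <array>
#include <cstdint>

static std::array<uint8_t, 32>
vtruncAndUnpackModel(const std::array<uint8_t, 32> &A,
                     const std::array<uint8_t, 32> &B) {
  std::array<uint8_t, 32> R{};   // widened into a zero v32i8
  for (int i = 0; i < 4; ++i) {
    R[i] = A[i * 8];             // VTRUNC(V1): original elements 0, 8, 16, 24
    R[i + 4] = B[i * 8];         // VTRUNC(V2): original elements 32, 40, 48, 56
  }
  return R;                      // bytes 8..31 stay zero
}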
+
+
/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -15236,7 +15600,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
- DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+ DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
}
// With AVX2 we have direct support for this permutation.
@@ -15256,8 +15620,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Otherwise, fall back.
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG,
- Subtarget);
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
+ DAG, Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
@@ -15269,7 +15633,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Op;
// If we have one input in place, then we can permute the other input and
@@ -15473,8 +15838,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
+ DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -15681,8 +16046,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
+ DAG, Subtarget);
}
SmallVector<int, 8> RepeatedMask;
@@ -15780,8 +16145,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG,
- Subtarget);
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
+ DAG, Subtarget);
}
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
@@ -15803,6 +16168,14 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
+ // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
+ // by zeroable elements in the remaining 24 elements. Turn this into two
+ // vmovqb instructions shuffled together.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
+ Mask, Zeroable, DAG))
+ return V;
+
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG);
@@ -15974,7 +16347,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
}
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
- DAG.getConstant(PermMask, DL, MVT::i8));
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
@@ -15999,7 +16372,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
- DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+ DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
}
SmallVector<int, 4> RepeatedMask;
@@ -16016,7 +16389,8 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Unpck;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Op;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
@@ -16389,6 +16763,49 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
+static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // Shuffle should be unary.
+ if (!V2.isUndef())
+ return SDValue();
+
+ int ShiftAmt = -1;
+ int NumElts = Mask.size();
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
+ "Unexpected mask index.");
+ if (M < 0)
+ continue;
+
+ // The first non-undef element determines our shift amount.
+ if (ShiftAmt < 0) {
+ ShiftAmt = M - i;
+ // Need to be shifting right.
+ if (ShiftAmt <= 0)
+ return SDValue();
+ }
+ // All non-undef elements must shift by the same amount.
+ if (ShiftAmt != M - i)
+ return SDValue();
+ }
+ assert(ShiftAmt >= 0 && "All undef?");
+
+ // Great, we found a shift right.
+ MVT WideVT = VT;
+ if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
+ WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
+ DAG.getUNDEF(WideVT), V1,
+ DAG.getIntPtrConstant(0, DL));
+ Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
+ DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+}
+
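// A minimal standalone sketch (not part of the patch) of the shift-amount
// matching loop above: every defined mask element must sit ShiftAmt places to
// the right of its source, and the shift must be strictly positive.
#include <vector>

static int matchMaskAsRightShift(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;               // undef elements match any shift
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;       // first defined element fixes the amount
      if (ShiftAmt <= 0)
        return -1;            // must be a right shift
    } else if (ShiftAmt != M - i) {
      return -1;              // all defined elements must agree
    }
  }
  return ShiftAmt;            // -1 also covers the all-undef case here
}
// e.g. {2, 3, -1, 5, 6, 7, -1, -1} -> 2, while {0, 1, 2, 3} -> -1.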
// Determine if this shuffle can be implemented with a KSHIFT instruction.
// Returns the shift amount if possible or -1 if not. This is a simplified
// version of matchShuffleAsShift.
@@ -16434,13 +16851,20 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
- unsigned NumElts = Mask.size();
+ int NumElts = Mask.size();
// Try to recognize shuffles that are just padding a subvector with zeros.
- unsigned SubvecElts = 0;
- for (int i = 0; i != (int)NumElts; ++i) {
- if (Mask[i] >= 0 && Mask[i] != i)
- break;
+ int SubvecElts = 0;
+ int Src = -1;
+ for (int i = 0; i != NumElts; ++i) {
+ if (Mask[i] >= 0) {
+ // Grab the source from the first valid mask element. All subsequent
+ // elements need to use this same source.
+ if (Src < 0)
+ Src = Mask[i] / NumElts;
+ if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
+ break;
+ }
++SubvecElts;
}
@@ -16451,30 +16875,54 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Make sure the number of zeroable bits in the top at least covers the bits
// not covered by the subvector.
- if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
+ if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
+ assert(Src >= 0 && "Expected a source!");
MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
- V1, DAG.getIntPtrConstant(0, DL));
+ Src == 0 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- getZeroVector(VT, Subtarget, DAG, DL),
+ DAG.getConstant(0, DL, VT),
Extract, DAG.getIntPtrConstant(0, DL));
}
+ // Try a simple shift right with undef elements. Later we'll try with zeros.
+ if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
+ DAG))
+ return Shift;
+
// Try to match KSHIFTs.
- // TODO: Support narrower than legal shifts by widening and extracting.
- if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) {
- unsigned Offset = 0;
- for (SDValue V : { V1, V2 }) {
- unsigned Opcode;
- int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
- if (ShiftAmt >= 0)
- return DAG.getNode(Opcode, DL, VT, V,
- DAG.getConstant(ShiftAmt, DL, MVT::i8));
- Offset += NumElts; // Increment for next iteration.
+ unsigned Offset = 0;
+ for (SDValue V : { V1, V2 }) {
+ unsigned Opcode;
+ int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
+ if (ShiftAmt >= 0) {
+ MVT WideVT = VT;
+ if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
+ WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
+ DAG.getUNDEF(WideVT), V,
+ DAG.getIntPtrConstant(0, DL));
+ // Widened right shifts need two shifts to ensure we shift in zeroes.
+ if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
+ int WideElts = WideVT.getVectorNumElements();
+ // Shift left to put the original vector in the MSBs of the new size.
+ Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
+ DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
+ // Increase the shift amount to account for the left shift.
+ ShiftAmt += WideElts - NumElts;
+ }
+
+ Res = DAG.getNode(Opcode, DL, WideVT, Res,
+ DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
}
+ Offset += NumElts; // Increment for next iteration.
}
+
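// A plain-integer model (not part of the patch) of the "two shifts" trick
// above: to right-shift a narrow k-bit mask inside a wider mask register,
// first shift it left into the MSBs so that zeroes, not stale bits, are
// shifted into the top of the narrow result.
#include <cassert>
#include <cstdint>

static unsigned kshiftrWidened(unsigned V, unsigned NumElts, unsigned WideElts,
                               unsigned ShiftAmt) {
  unsigned Wide = V;                          // INSERT_SUBVECTOR at index 0
  Wide <<= (WideElts - NumElts);              // KSHIFTL into the MSBs
  Wide >>= (ShiftAmt + (WideElts - NumElts)); // KSHIFTR with adjusted amount
  return Wide & ((1u << NumElts) - 1);        // EXTRACT_SUBVECTOR of low bits
}

int main() {
  // A v4i1 mask 0b1011 shifted right by 1 becomes 0b0101, with a zero
  // shifted into the top element.
  assert(kshiftrWidened(0b1011u, 4, 16, 1) == 0b0101u);
  return 0;
}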
MVT ExtVT;
switch (VT.SimpleTy) {
default:
@@ -16594,7 +17042,7 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
+ ArrayRef<int> OrigMask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
@@ -16620,8 +17068,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
if (V2IsUndef &&
- any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
- SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
+ SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
@@ -16629,15 +17077,16 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
}
// Check for illegal shuffle mask element index values.
- int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
- assert(llvm::all_of(Mask,
+ int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
+ (void)MaskUpperLimit;
+ assert(llvm::all_of(OrigMask,
[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index");
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
- APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2);
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
@@ -16645,11 +17094,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// Create an alternative mask with info about zeroable elements.
// Here we do not set undef elements as zeroable.
- SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+ SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
if (V2IsZero) {
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
for (int i = 0; i != NumElements; ++i)
- if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+ if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
ZeroableMask[i] = SM_SentinelZero;
}
@@ -16664,7 +17113,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
Subtarget, DAG))
return Broadcast;
@@ -16700,8 +17149,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
}
// Commute the shuffle if it will improve canonicalization.
- if (canonicalizeShuffleMaskWithCommute(Mask))
- return DAG.getCommutedVectorShuffle(*SVOp);
+ SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
+ if (canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(V1, V2);
+ }
if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
@@ -16910,7 +17362,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
// Use kshiftr instruction to move to the lower element.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
@@ -17137,8 +17589,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
(Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
- N2 = DAG.getIntPtrConstant(1, dl);
- return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
+ DAG.getTargetConstant(1, dl, MVT::i8));
}
}
@@ -17207,14 +17659,14 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// But if optimizing for size and there's a load folding opportunity,
// generate insertps because blendps does not have a 32-bit memory
// operand form.
- N2 = DAG.getIntPtrConstant(1, dl);
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
- return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
+ DAG.getTargetConstant(1, dl, MVT::i8));
}
- N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
+ DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
}
// PINSR* works with constant index.
@@ -17300,7 +17752,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
// Shift to the LSB.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
@@ -17841,10 +18293,10 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
std::swap(Op0, Op1);
APInt APIntShiftAmt;
- if (isConstantSplat(Amt, APIntShiftAmt)) {
+ if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
- return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
- Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
+ Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
}
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
@@ -17970,6 +18422,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ if (VT == MVT::f128)
+ return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
+
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
@@ -18072,6 +18527,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
return Result;
}
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+ return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -18126,8 +18591,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget.hasSSE3()) {
- // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
+ if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
@@ -18273,7 +18737,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// Low will be bitcasted right away, so do not bother bitcasting back to its
// original type.
Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
- VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+ VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
@@ -18281,7 +18745,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// High will be bitcasted right away, so do not bother bitcasting back to
// its original type.
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
- VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+ VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
} else {
SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
@@ -18329,16 +18793,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
+ MVT SrcVT = N0.getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
- if (Op.getSimpleValueType().isVector())
+ if (DstVT == MVT::f128)
+ return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
+
+ if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
- MVT SrcVT = N0.getSimpleValueType();
- MVT DstVT = Op.getSimpleValueType();
-
if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
(SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
// Conversions from unsigned i32 to f32/f64 are legal,
@@ -18346,6 +18812,12 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return Op;
}
+ // Promote i32 to i64 and use a signed conversion on 64-bit targets.
+ if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0);
+ }
+
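// A standalone check (not part of the patch) of why the promotion above is
// sound: zero-extending an unsigned 32-bit value to i64 always yields a
// non-negative number, so a signed i64 conversion produces the same result as
// an unsigned i32 conversion.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t U = 0xFFFFFFFFu;            // largest u32
  int64_t Wide = (int64_t)(uint64_t)U; // the ZERO_EXTEND step
  assert((double)Wide == (double)U);   // SINT_TO_FP(i64) == UINT_TO_FP(i32)
  return 0;
}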
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
@@ -18579,7 +19051,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
- if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
+ if (VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
@@ -18602,10 +19074,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
// Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
//
-
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
-
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
// Short-circuit if we can determine that each 128-bit half is the same value.
@@ -18903,9 +19372,29 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
- // If called by the legalizer just return.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+ // If we're called by the type legalizer, handle a few cases.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(InVT)) {
+ if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
+ VT.is128BitVector()) {
+ assert(Subtarget.hasVLX() && "Unexpected subtarget!");
+ // The default behavior is to truncate one step, concatenate, and then
+ // truncate the remainder. We'd rather produce two 64-bit results and
+ // concatenate those.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
+ // Otherwise let default legalization handle it.
return SDValue();
+ }
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
@@ -18940,6 +19429,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
return V;
+ // Handle truncation of V256 to V128 using shuffles.
+ assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
+
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
@@ -19016,22 +19508,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
}
- // Handle truncation of V256 to V128 using shuffles.
- assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
-
- assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
-
- unsigned NumElems = VT.getVectorNumElements();
- MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
-
- SmallVector<int, 16> MaskVec(NumElems * 2, -1);
- // Prepare truncation shuffle mask
- for (unsigned i = 0; i != NumElems; ++i)
- MaskVec[i] = i * 2;
- In = DAG.getBitcast(NVT, In);
- SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
- DAG.getIntPtrConstant(0, DL));
+ llvm_unreachable("All 256->128 cases should have been handled above!");
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
@@ -19041,6 +19518,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
+ if (SrcVT == MVT::f128) {
+ RTLIB::Libcall LC;
+ if (Op.getOpcode() == ISD::FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(SrcVT, VT);
+ else
+ LC = RTLIB::getFPTOUINT(SrcVT, VT);
+
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first;
+ }
+
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
@@ -19075,14 +19563,27 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
- if (!IsSigned && Subtarget.hasAVX512()) {
- // Conversions from f32/f64 should be legal.
- if (UseSSEReg)
+ if (!IsSigned && UseSSEReg) {
+ // Conversions from f32/f64 with AVX512 should be legal.
+ if (Subtarget.hasAVX512())
return Op;
- // Use default expansion.
+ // Use default expansion for i64.
if (VT == MVT::i64)
return SDValue();
+
+ assert(VT == MVT::i32 && "Unexpected VT!");
+
+ // Promote i32 to i64 and use a signed operation on 64-bit targets.
+ if (Subtarget.is64Bit()) {
+ SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
+ // use fisttp which will be handled later.
+ if (!Subtarget.hasSSE3())
+ return SDValue();
}
// Promote i16 to i32 if we can use a SSE operation.
@@ -19103,12 +19604,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
-static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
+ if (VT == MVT::f128) {
+ RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
+ return LowerF128Call(Op, DAG, LC);
+ }
+
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
return DAG.getNode(X86ISD::VFPEXT, DL, VT,
@@ -19116,14 +19622,31 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
In, DAG.getUNDEF(SVT)));
}
-/// Horizontal vector math instructions may be slower than normal math with
-/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
-/// implementation, and likely shuffle complexity of the alternate sequence.
-static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
- bool HasFastHOps = Subtarget.hasFastHorizontalOps();
- return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getSimpleValueType();
+
+ // It's legal except when f128 is involved.
+ if (SVT != MVT::f128)
+ return Op;
+
+ RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
+
+ // FP_ROUND node has a second operand indicating whether it is known to be
+ // precise. That doesn't take part in the LibCall so we can't directly use
+ // LowerF128Call.
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first;
+}
+
+// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking
+// the default expansion of STRICT_FP_ROUND.
+static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) {
+ // FIXME: Need to form a libcall with an input chain for f128.
+ assert(Op.getOperand(0).getValueType() != MVT::f128 &&
+ "Don't know how to handle f128 yet!");
+ return Op;
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
@@ -19200,8 +19723,13 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
-static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
+ if (Op.getValueType() == MVT::f128) {
+ RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? RTLIB::ADD_F128
+ : RTLIB::SUB_F128;
+ return LowerF128Call(Op, DAG, LC);
+ }
+
assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
@@ -19358,13 +19886,13 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
SelectionDAG &DAG) {
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
+ DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
}
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
/// style scalarized (associative) reduction patterns.
-static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp,
- SmallVectorImpl<SDValue> &SrcOps) {
+static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
+ SmallVectorImpl<SDValue> &SrcOps) {
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
@@ -19437,7 +19965,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
return SDValue();
SmallVector<SDValue, 8> VecIns;
- if (!matchBitOpReduction(Op, ISD::OR, VecIns))
+ if (!matchScalarReduction(Op, ISD::OR, VecIns))
return SDValue();
// Quit if not 128/256-bit vector.
@@ -19461,8 +19989,8 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL,
- MVT::i8);
+ X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
+ DL, MVT::i8);
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
@@ -19576,6 +20104,13 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case X86ISD::XOR:
case X86ISD::AND:
return SDValue(Op.getNode(), 1);
+ case ISD::SSUBO:
+ case ISD::USUBO: {
+ // SSUBO/USUBO will become an X86ISD::SUB and we can use its Z flag.
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
+ Op->getOperand(1)).getValue(1);
+ }
default:
default_case:
break;
@@ -19766,6 +20301,63 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
+SDValue
+X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N,0); // Lower SDIV as SDIV
+
+ assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
+ "Unexpected divisor!");
+
+ // Only perform this transform if CMOV is supported otherwise the select
+ // below will become a branch.
+ if (!Subtarget.hasCMov())
+ return SDValue();
+
+ // fold (sdiv X, pow2)
+ EVT VT = N->getValueType(0);
+ // FIXME: Support i8.
+ if (VT != MVT::i16 && VT != MVT::i32 &&
+ !(Subtarget.is64Bit() && VT == MVT::i64))
+ return SDValue();
+
+ unsigned Lg2 = Divisor.countTrailingZeros();
+
+ // If the divisor is 2 or -2, the default expansion is better.
+ if (Lg2 == 1)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
+ SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
+
+ // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
+ SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+ SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
+
+ Created.push_back(Cmp.getNode());
+ Created.push_back(Add.getNode());
+ Created.push_back(CMov.getNode());
+
+ // Divide by pow2.
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i64));
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (Divisor.isNonNegative())
+ return SRA;
+
+ Created.push_back(SRA.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
+}
+
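// A scalar model (not part of the patch) of the CMOV-based expansion built
// above: round toward zero by adding Pow2-1 only when the dividend is
// negative, then do an arithmetic shift right, and negate the result for a
// negative divisor. (Signed right shift is arithmetic on x86 compilers.)
#include <cassert>
#include <cstdint>

static int32_t sdivPow2Model(int32_t X, unsigned Lg2, bool NegDivisor) {
  int32_t Add = X + ((int32_t(1) << Lg2) - 1); // Pow2 - 1
  int32_t Sel = X < 0 ? Add : X;               // the CMOV
  int32_t Sra = Sel >> Lg2;                    // divide by Pow2
  return NegDivisor ? -Sra : Sra;
}

int main() {
  assert(sdivPow2Model(-7, 2, false) == -7 / 4); // -1, rounded toward zero
  assert(sdivPow2Model(25, 3, true) == 25 / -8); // -3
  return 0;
}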
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
@@ -19842,8 +20434,8 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
if (Src.getValueType() != BitNo.getValueType())
BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
- X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
- dl, MVT::i8);
+ X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
+ dl, MVT::i8);
return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
}
@@ -19935,13 +20527,6 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- // If this is a seteq make sure any build vectors of all zeros are on the RHS.
- // This helps with vptestm matching.
- // TODO: Should we just canonicalize the setcc during DAG combine?
- if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
- ISD::isBuildVectorAllZeros(Op0.getNode()))
- std::swap(Op0, Op1);
-
// Prefer SETGT over SETLT.
if (SetCCOpcode == ISD::SETLT) {
SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
@@ -20007,7 +20592,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
// Only do this pre-AVX since vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
- SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false);
+ SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
@@ -20018,7 +20603,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
// This is beneficial because materializing a constant 0 for the PCMPEQ is
// probably cheaper than XOR+PCMPGT using 2 different vector constants:
// cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
- SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true);
+ SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
if (!UGEOp1)
return SDValue();
Op1 = Op0;
@@ -20086,14 +20671,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(CC0, dl, MVT::i8));
+ DAG.getTargetConstant(CC0, dl, MVT::i8));
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(CC1, dl, MVT::i8));
+ DAG.getTargetConstant(CC1, dl, MVT::i8));
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
// Handle all other FP comparisons here.
Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(SSECC, dl, MVT::i8));
+ DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
@@ -20106,16 +20691,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
MVT VTOp0 = Op0.getSimpleValueType();
+ (void)VTOp0;
assert(VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!");
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
- // This is being called by type legalization because v2i32 is marked custom
- // for result type legalization for v2f32.
- if (VTOp0 == MVT::v2i32)
- return SDValue();
-
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
@@ -20153,7 +20734,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(CmpMode, dl, MVT::i8));
+ DAG.getTargetConstant(CmpMode, dl, MVT::i8));
}
// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
@@ -20222,21 +20803,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
- if (Cond == ISD::SETUGT &&
- ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
- return !C->getAPIntValue().isMaxValue();
- })) {
+ if (Cond == ISD::SETUGT) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
- Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT));
- Cond = ISD::SETUGE;
+ if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
+ Op1 = UGTOp1;
+ Cond = ISD::SETUGE;
+ }
}
- if (Cond == ISD::SETULT &&
- ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
- return !C->getAPIntValue().isNullValue();
- })) {
+ if (Cond == ISD::SETULT) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
- Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT));
- Cond = ISD::SETULE;
+ if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
+ Op1 = ULTOp1;
+ Cond = ISD::SETULE;
+ }
}
bool Invert = false;
unsigned Opc;
@@ -20360,11 +20939,11 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
return Result;
}
-// Try to select this as a KORTEST+SETCC if possible.
-static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
- const SDLoc &dl, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SDValue &X86CC) {
+// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
+static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDValue &X86CC) {
// Only support equality comparisons.
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return SDValue();
@@ -20389,6 +20968,21 @@ static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
} else
return SDValue();
+ // If the input is an AND, we can combine its operands into the KTEST.
+ bool KTestable = false;
+ if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
+ KTestable = true;
+ if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
+ KTestable = true;
+ if (!isNullConstant(Op1))
+ KTestable = false;
+ if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
+ SDValue LHS = Op0.getOperand(0);
+ SDValue RHS = Op0.getOperand(1);
+ X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
+ }
+
// If the input is an OR, we can combine its operands into the KORTEST.
SDValue LHS = Op0;
SDValue RHS = Op0;
@@ -20397,7 +20991,7 @@ static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
RHS = Op0.getOperand(1);
}
- X86CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
}
@@ -20425,9 +21019,9 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
return PTEST;
}
- // Try to lower using KORTEST.
- if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
- return KORTEST;
+ // Try to lower using KORTEST or KTEST.
+ if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
+ return Test;
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
@@ -20442,7 +21036,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
if (Invert) {
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
- X86CC = DAG.getConstant(CCode, dl, MVT::i8);
+ X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
}
return Op0.getOperand(1);
@@ -20456,7 +21050,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
- X86CC = DAG.getConstant(CondCode, dl, MVT::i8);
+ X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
}
@@ -20472,6 +21066,19 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ // Handle f128 first, since one possible outcome is a normal integer
+ // comparison which gets handled by emitFlagsForSetcc.
+ if (Op0.getValueType() == MVT::f128) {
+ softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1);
+
+ // If softenSetCCOperands returned a scalar, use it.
+ if (!Op1.getNode()) {
+ assert(Op0.getValueType() == Op.getValueType() &&
+ "Unexpected setcc expansion!");
+ return Op0;
+ }
+ }
+
SDValue X86CC;
SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
if (!EFLAGS)
@@ -20612,15 +21219,16 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
if (Subtarget.hasAVX512()) {
- SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
- CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
+ SDValue Cmp =
+ DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
+ DAG.getTargetConstant(SSECC, DL, MVT::i8));
assert(!VT.isVector() && "Not a scalar type?");
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
if (SSECC < 8 || Subtarget.hasAVX()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
- DAG.getConstant(SSECC, DL, MVT::i8));
+ DAG.getTargetConstant(SSECC, DL, MVT::i8));
// If we have AVX, we can use a variable vector select (VBLENDV) instead
// of 3 logic instructions for size savings and potentially speed.
@@ -20718,8 +21326,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
- unsigned CondCode =
- cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+ unsigned CondCode = Cond.getConstantOperandVal(0);
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
@@ -20807,8 +21414,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
- MVT VT = Op.getSimpleValueType();
-
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
!isScalarFPTypeInSSEReg(VT)) // FPStack?
@@ -20826,7 +21431,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
- CC = DAG.getConstant(X86Cond, DL, MVT::i8);
+ CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
AddTest = false;
}
@@ -20848,7 +21453,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
if (AddTest) {
- CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
X86::COND_NE, DL, DAG);
}
@@ -20864,9 +21469,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(isNullConstant(Op1) || isNullConstant(Op2))) {
- SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- Cond);
+ SDValue Res =
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
@@ -21037,8 +21642,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
// pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
if (Subtarget.hasAVX()) {
assert(VT.is256BitVector() && "256-bit vector expected");
- int HalfNumElts = NumElts / 2;
- MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ int HalfNumElts = HalfVT.getVectorNumElements();
unsigned NumSrcElts = InVT.getVectorNumElements();
SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
@@ -21081,7 +21686,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
- DAG.getConstant(SignExtShift, dl, MVT::i8));
+ DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
}
if (VT == MVT::v2i64) {
@@ -21119,7 +21724,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
- if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
+ if (VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
@@ -21138,10 +21743,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
// for v4i32 the high shuffle mask will be {2, 3, -1, -1}
// use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
// concat the vectors to original VT
-
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
-
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
unsigned NumElems = InVT.getVectorNumElements();
@@ -21165,7 +21767,7 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. We are assuming the input op is legal (this transform
// is only used for targets with AVX).
- if (Store->isVolatile())
+ if (!Store->isSimple())
return SDValue();
MVT StoreVT = StoredVal.getSimpleValueType();
@@ -21201,7 +21803,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. We are assuming the input op is legal (this transform
// is only used for targets with AVX).
- if (Store->isVolatile())
+ if (!Store->isSimple())
return SDValue();
MVT StoreSVT = StoreVT.getScalarType();
@@ -21266,14 +21868,13 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
- if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
- TargetLowering::TypeWidenVector)
- return SDValue();
+ assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
+ TargetLowering::TypeWidenVector && "Unexpected type action!");
- MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
- StoreVT.getVectorNumElements() * 2);
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
@@ -21313,11 +21914,10 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
- EVT MemVT = Ld->getMemoryVT();
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
if (RegVT.getVectorElementType() == MVT::i1) {
- assert(EVT(RegVT) == MemVT && "Expected non-extending load");
+ assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
@@ -21336,176 +21936,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
}
- // Nothing useful we can do without SSE2 shuffles.
- assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned RegSz = RegVT.getSizeInBits();
-
- ISD::LoadExtType Ext = Ld->getExtensionType();
-
- assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
- && "Only anyext and sext are currently implemented.");
- assert(MemVT != RegVT && "Cannot extend to the same type");
- assert(MemVT.isVector() && "Must load a vector from memory");
-
- unsigned NumElems = RegVT.getVectorNumElements();
- unsigned MemSz = MemVT.getSizeInBits();
- assert(RegSz > MemSz && "Register size must be greater than the mem size");
-
- if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
- // The only way in which we have a legal 256-bit vector result but not the
- // integer 256-bit operations needed to directly lower a sextload is if we
- // have AVX1 but not AVX2. In that case, we can always emit a sextload to
- // a 128-bit vector and a normal sign_extend to 256-bits that should get
- // correctly legalized. We do this late to allow the canonical form of
- // sextload to persist throughout the rest of the DAG combiner -- it wants
- // to fold together any extensions it can, and so will fuse a sign_extend
- // of an sextload into a sextload targeting a wider value.
- SDValue Load;
- if (MemSz == 128) {
- // Just switch this to a normal load.
- assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
- "it must be a legal 128-bit vector "
- "type!");
- Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- } else {
- assert(MemSz < 128 &&
- "Can't extend a type wider than 128 bits to a 256 bit vector!");
- // Do an sext load to a 128-bit vector type. We want to use the same
- // number of elements, but elements half as wide. This will end up being
- // recursively lowered by this routine, but will succeed as we definitely
- // have all the necessary features if we're using AVX1.
- EVT HalfEltVT =
- EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
- EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
- Load =
- DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- }
-
- // Replace chain users with the new chain.
- assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
-
- // Finally, do a normal sign-extend to the desired register.
- SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT);
- return DAG.getMergeValues({SExt, Load.getValue(1)}, dl);
- }
-
- // All sizes must be a power of two.
- assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
- "Non-power-of-two elements are not custom lowered!");
-
- // Attempt to load the original value using scalar loads.
- // Find the largest scalar type that divides the total loaded size.
- MVT SclrLoadTy = MVT::i8;
- for (MVT Tp : MVT::integer_valuetypes()) {
- if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
- SclrLoadTy = Tp;
- }
- }
-
- // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
- if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
- (64 <= MemSz))
- SclrLoadTy = MVT::f64;
-
- // Calculate the number of scalar loads that we need to perform
- // in order to load our vector from memory.
- unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
-
- assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
- "Can only lower sext loads with a single scalar load!");
-
- unsigned loadRegSize = RegSz;
- if (Ext == ISD::SEXTLOAD && RegSz >= 256)
- loadRegSize = 128;
-
- // If we don't have BWI we won't be able to create the shuffle needed for
- // v8i8->v8i64.
- if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
- MemVT == MVT::v8i8)
- loadRegSize = 128;
-
- // Represent our vector as a sequence of elements which are the
- // largest scalar that we can load.
- EVT LoadUnitVecVT = EVT::getVectorVT(
- *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
-
- // Represent the data using the same element type that is stored in
- // memory. In practice, we ''widen'' MemVT.
- EVT WideVecVT =
- EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
- loadRegSize / MemVT.getScalarSizeInBits());
-
- assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
- "Invalid vector type");
-
- // We can't shuffle using an illegal type.
- assert(TLI.isTypeLegal(WideVecVT) &&
- "We only lower types that form legal widened vector types");
-
- SmallVector<SDValue, 8> Chains;
- SDValue Ptr = Ld->getBasePtr();
- unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8;
- SDValue Increment = DAG.getConstant(OffsetInc, dl,
- TLI.getPointerTy(DAG.getDataLayout()));
- SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
-
- unsigned Offset = 0;
- for (unsigned i = 0; i < NumLoads; ++i) {
- unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset);
-
- // Perform a single load.
- SDValue ScalarLoad =
- DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
- Ld->getPointerInfo().getWithOffset(Offset),
- NewAlign, Ld->getMemOperand()->getFlags());
- Chains.push_back(ScalarLoad.getValue(1));
- // Create the first element type using SCALAR_TO_VECTOR in order to avoid
- // another round of DAGCombining.
- if (i == 0)
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
- else
- Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
- ScalarLoad, DAG.getIntPtrConstant(i, dl));
-
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
- Offset += OffsetInc;
- }
-
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
-
- // Bitcast the loaded value to a vector of the original element type, in
- // the size of the target vector type.
- SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
- unsigned SizeRatio = RegSz / MemSz;
-
- if (Ext == ISD::SEXTLOAD) {
- SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG);
- return DAG.getMergeValues({Sext, TF}, dl);
- }
-
- if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
- MemVT == MVT::v8i8) {
- SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG);
- return DAG.getMergeValues({Sext, TF}, dl);
- }
-
- // Redistribute the loaded elements into the different locations.
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i * SizeRatio] = i;
-
- SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
- DAG.getUNDEF(WideVecVT), ShuffleVec);
-
- // Bitcast to the requested type.
- Shuff = DAG.getBitcast(RegVT, Shuff);
- return DAG.getMergeValues({Shuff, TF}, dl);
+ return SDValue();
}
/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
@@ -21610,7 +22041,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (Inverted)
X86Cond = X86::GetOppositeBranchCondition(X86Cond);
- CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
addTest = false;
} else {
unsigned CondOpc;
@@ -21638,10 +22069,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp) &&
Op.getNode()->hasOneUse()) {
- X86::CondCode CCode =
- (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
- CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getConstant(CCode, dl, MVT::i8);
+ X86::CondCode CCode0 =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode0 = X86::GetOppositeBranchCondition(CCode0);
+ CC = DAG.getTargetConstant(CCode0, dl, MVT::i8);
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
@@ -21654,12 +22085,12 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
(void)NewBR;
Dest = FalseBB;
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- X86::CondCode CCode =
- (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
- CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getConstant(CCode, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain,
+ Dest, CC, Cmp);
+ X86::CondCode CCode1 =
+ (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+ CCode1 = X86::GetOppositeBranchCondition(CCode1);
+ CC = DAG.getTargetConstant(CCode1, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
@@ -21672,7 +22103,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getConstant(CCode, dl, MVT::i8);
+ CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
Cond = Cond.getOperand(0).getOperand(1);
addTest = false;
} else if (Cond.getOpcode() == ISD::SETCC &&
@@ -21698,10 +22129,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
- CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
@@ -21714,10 +22145,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
- CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
@@ -21742,7 +22173,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
- CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
X86Cond, dl, DAG);
}
@@ -21770,7 +22201,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Align = Op.getConstantOperandVal(2);
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
@@ -21811,7 +22242,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
}
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
- unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Register Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
@@ -21821,7 +22252,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- unsigned SPReg = RegInfo->getStackRegister();
+ Register SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
@@ -22076,7 +22507,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
}
return DAG.getNode(Opc, dl, VT, SrcOp,
- DAG.getConstant(ShiftAmt, dl, MVT::i8));
+ DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
}
/// Handle vector element shifts where the shift amount may or may not be a
@@ -22121,7 +22552,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
else {
- SDValue ByteShift = DAG.getConstant(
+ SDValue ByteShift = DAG.getTargetConstant(
(128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
@@ -22308,13 +22739,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
- return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
return false;
};
auto isRoundModeSAE = [](SDValue Rnd) {
- if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
- return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC;
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
+ unsigned RC = C->getZExtValue();
+ if (RC & X86::STATIC_ROUNDING::NO_EXC) {
+ // Clear the NO_EXC bit and check remaining bits.
+ RC ^= X86::STATIC_ROUNDING::NO_EXC;
+      // As a convenience we allow either no other bits set, or an explicit
+      // current-direction rounding mode.
+ return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ }
+ }
return false;
};
@@ -22335,7 +22774,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
};
SDLoc dl(Op);
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
@@ -22411,9 +22850,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
- if (IntrData->Type == INTR_TYPE_3OP_IMM8)
- Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
-
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -22666,7 +23102,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
SDValue CC = Op.getOperand(3);
- CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -22685,7 +23120,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
- SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
+ SDValue CC = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Cmp;
@@ -22750,16 +23185,16 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned CondVal = Op.getConstantOperandVal(3);
SDValue Sae = Op.getOperand(4);
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8));
+ DAG.getTargetConstant(CondVal, dl, MVT::i8));
else if (isRoundModeSAE(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
else
return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
@@ -22819,9 +23254,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
- SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
- Op.getOperand(2),
- DAG.getConstant(0xf, dl, MVT::i32));
+ auto Round = cast<ConstantSDNode>(Op.getOperand(2));
+ SDValue RoundingMode =
+ DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), RoundingMode);
}
@@ -22829,12 +23264,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
- SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
- Op.getOperand(3),
- DAG.getConstant(0xf, dl, MVT::i32));
+ auto Round = cast<ConstantSDNode>(Op.getOperand(3));
+ SDValue RoundingMode =
+ DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
+ case BEXTRI: {
+ assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");
+
+ // The control is a TargetConstant, but we need to convert it to a
+ // ConstantSDNode.
+ uint64_t Imm = Op.getConstantOperandVal(2);
+ SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Control);
+ }
// ADC/ADCX/SBB
case ADX: {
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
@@ -23165,6 +23610,61 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
MaskVT, Operation);
return DAG.getMergeValues({Result0, Result1}, DL);
}
+ case Intrinsic::x86_mmx_pslli_w:
+ case Intrinsic::x86_mmx_pslli_d:
+ case Intrinsic::x86_mmx_pslli_q:
+ case Intrinsic::x86_mmx_psrli_w:
+ case Intrinsic::x86_mmx_psrli_d:
+ case Intrinsic::x86_mmx_psrli_q:
+ case Intrinsic::x86_mmx_psrai_w:
+ case Intrinsic::x86_mmx_psrai_d: {
+ SDLoc DL(Op);
+ SDValue ShAmt = Op.getOperand(2);
+ // If the argument is a constant, convert it to a target constant.
+ if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
+ ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1), ShAmt);
+ }
+
+ unsigned NewIntrinsic;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_mmx_pslli_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_w;
+ break;
+ case Intrinsic::x86_mmx_pslli_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_d;
+ break;
+ case Intrinsic::x86_mmx_pslli_q:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_q;
+ break;
+ case Intrinsic::x86_mmx_psrli_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
+ break;
+ case Intrinsic::x86_mmx_psrli_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
+ break;
+ case Intrinsic::x86_mmx_psrli_q:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
+ break;
+ case Intrinsic::x86_mmx_psrai_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psra_w;
+ break;
+ case Intrinsic::x86_mmx_psrai_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psra_d;
+ break;
+ }
+
+      // The vector shift intrinsics with scalar amounts use 32-bit shift
+      // amounts, but the SSE2/MMX shift instructions read 64 bits. Copy the
+      // 32 bits into an MMX register.
+ ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ DAG.getConstant(NewIntrinsic, DL, MVT::i32),
+ Op.getOperand(1), ShAmt);
+
+ }
}
}
@@ -23177,7 +23677,9 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
// Scale must be constant.
if (!C)
return SDValue();
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
@@ -23204,7 +23706,9 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
// Scale must be constant.
if (!C)
return SDValue();
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
VT.getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
@@ -23238,7 +23742,9 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
// Scale must be constant.
if (!C)
return SDValue();
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
Src.getSimpleValueType().getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
@@ -23266,7 +23772,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
// Scale must be constant.
if (!C)
return SDValue();
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
@@ -23435,8 +23943,7 @@ EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-
+ unsigned IntNo = Op.getConstantOperandVal(1);
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
@@ -23538,10 +24045,10 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
- SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
- DAG.getConstant(1, dl, Op->getValueType(1)),
- DAG.getConstant(X86::COND_B, dl, MVT::i8),
- SDValue(Result.getNode(), 1) };
+ SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+ DAG.getConstant(1, dl, Op->getValueType(1)),
+ DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
+ SDValue(Result.getNode(), 1)};
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
// Return { result, isValid, chain }.
@@ -23581,8 +24088,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
Scale, Chain, Subtarget);
}
case PREFETCH: {
- SDValue Hint = Op.getOperand(6);
- unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
+ const APInt &HintVal = Op.getConstantOperandAPInt(6);
assert((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3");
unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
@@ -23678,7 +24184,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -23730,7 +24236,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
@@ -23743,12 +24249,11 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
+Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const MachineFunction &MF = DAG.getMachineFunction();
- unsigned Reg = StringSwitch<unsigned>(RegName)
+ Register Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
.Case("ebp", X86::EBP)
@@ -23762,8 +24267,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
#ifndef NDEBUG
else {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- unsigned FrameReg =
- RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+ Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
"Invalid Frame Register!");
}
@@ -23809,7 +24313,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+ Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
@@ -23967,6 +24471,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
+ case CallingConv::Tail:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
@@ -24279,12 +24784,9 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
if (Opc == ISD::CTLZ) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
- SDValue Ops[] = {
- Op,
- DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
- DAG.getConstant(X86::COND_E, dl, MVT::i8),
- Op.getValue(1)
- };
+ SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
+ DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)};
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
}
@@ -24312,12 +24814,9 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
// If src is zero (i.e. bsf sets ZF), returns NumBits.
- SDValue Ops[] = {
- Op,
- DAG.getConstant(NumBits, dl, VT),
- DAG.getConstant(X86::COND_E, dl, MVT::i8),
- Op.getValue(1)
- };
+ SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
+ DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)};
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
@@ -24453,7 +24952,7 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
SDValue N0 = Op.getOperand(0);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
DAG.getConstant(0, DL, VT), N0);
- SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
+ SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
SDValue(Neg.getNode(), 1)};
return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
}
@@ -25033,7 +25532,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
// Optimize shl/srl/sra with constant shift amount.
APInt APIntShiftAmt;
- if (!isConstantSplat(Amt, APIntShiftAmt))
+ if (!X86::isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
// If the shift amount is out of range, return undef.
@@ -25220,7 +25719,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
}
ConstantSDNode *ND = cast<ConstantSDNode>(Op);
- APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
+ APInt C(SVTBits, ND->getZExtValue());
uint64_t ShAmt = C.getZExtValue();
if (ShAmt >= SVTBits) {
Elts.push_back(DAG.getUNDEF(SVT));
@@ -25502,7 +26001,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
(VT == MVT::v32i8 && Subtarget.hasInt256())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
- SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8);
+ SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
// Extend constant shift amount to vXi16 (it doesn't matter if the type
// isn't legal).
@@ -25774,7 +26273,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(Op, DL, VT, R,
- DAG.getConstant(RotateAmt, DL, MVT::i8));
+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
}
// Else, fall-back on VPROLV/VPRORV.
@@ -25795,7 +26294,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
if (0 <= CstSplatIndex) {
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
- DAG.getConstant(RotateAmt, DL, MVT::i8));
+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
}
// Use general rotate by variable (per-element).
@@ -26032,7 +26531,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// If this is a canonical idempotent atomicrmw w/no uses, we have a better
// lowering available in lowerAtomicArith.
- // TODO: push more cases through this path.
+ // TODO: push more cases through this path.
if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
AI->use_empty())
@@ -26087,10 +26586,22 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
return Loaded;
}
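+// When the experimental unordered-atomic ISel option is enabled, unordered
+// atomic stores and loads are lowered as ordinary Store/Load SDNodes.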
+bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
+ if (!SI.isUnordered())
+ return false;
+ return ExperimentalUnorderedISEL;
+}
+bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
+ if (!LI.isUnordered())
+ return false;
+ return ExperimentalUnorderedISEL;
+}
+
+
/// Emit a locked operation on a stack location which does not change any
/// memory location, but does involve a lock prefix. Location is chosen to be
/// a) very likely accessed only by a single thread to minimize cache traffic,
-/// and b) definitely dereferenceable. Returns the new Chain result.
+/// and b) definitely dereferenceable. Returns the new Chain result.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue Chain, SDLoc DL) {
@@ -26099,22 +26610,22 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
// operations issued by the current processor. As such, the location
// referenced is not relevant for the ordering properties of the instruction.
  // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
- // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+ // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
// 2) Using an immediate operand appears to be the best encoding choice
// here since it doesn't require an extra register.
// 3) OR appears to be very slightly faster than ADD. (Though, the difference
// is small enough it might just be measurement noise.)
// 4) When choosing offsets, there are several contributing factors:
// a) If there's no redzone, we default to TOS. (We could allocate a cache
- // line aligned stack object to improve this case.)
+ // line aligned stack object to improve this case.)
// b) To minimize our chances of introducing a false dependence, we prefer
- // to offset the stack usage from TOS slightly.
+ // to offset the stack usage from TOS slightly.
// c) To minimize concerns about cross thread stack usage - in particular,
// the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
// captures state in the TOS frame and accesses it from many threads -
// we want to use an offset such that the offset is in a distinct cache
// line from the TOS frame.
- //
+ //
// For a general discussion of the tradeoffs and benchmark results, see:
// https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
@@ -26155,10 +26666,10 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
- AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
- cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
- SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
- cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+ AtomicOrdering FenceOrdering =
+ static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
+ SyncScope::ID FenceSSID =
+ static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
@@ -26167,7 +26678,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
- SDValue Chain = Op.getOperand(0);
+ SDValue Chain = Op.getOperand(0);
return emitLockedStackOp(DAG, Subtarget, Chain, dl);
}
@@ -26218,6 +26729,17 @@ static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT InVT = V.getSimpleValueType();
+ if (InVT == MVT::v64i8) {
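+    // Split the v64i8 input in half, compute PMOVMSKB on each v32i8 half,
+    // and combine the two 32-bit masks into a single i64 result.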
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
+ Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
+ Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
+ DAG.getConstant(32, DL, MVT::i8));
+ return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
+ }
if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
@@ -26258,8 +26780,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
- EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
- DstVT.getVectorNumElements() / 2);
+ MVT CastVT = DstVT.getHalfNumVectorElementsVT();
Lo = DAG.getBitcast(CastVT, Lo);
Hi = DAG.getBitcast(CastVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
@@ -26275,53 +26796,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getZExtOrTrunc(V, DL, DstVT);
}
- if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
- SrcVT == MVT::i64) {
- assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
- if (DstVT != MVT::f64 && DstVT != MVT::i64 &&
- !(DstVT == MVT::x86mmx && SrcVT.isVector()))
- // This conversion needs to be expanded.
- return SDValue();
+ assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
+ SrcVT == MVT::i64) && "Unexpected VT!");
- SDLoc dl(Op);
- if (SrcVT.isVector()) {
- // Widen the vector in input in the case of MVT::v2i32.
- // Example: from MVT::v2i32 to MVT::v4i32.
- MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
- SrcVT.getVectorNumElements() * 2);
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
- DAG.getUNDEF(SrcVT));
- } else {
- assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
- "Unexpected source type in LowerBITCAST");
- Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
- }
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
+ !(DstVT == MVT::x86mmx && SrcVT.isVector()))
+ // This conversion needs to be expanded.
+ return SDValue();
- MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
- Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
+ SDLoc dl(Op);
+ if (SrcVT.isVector()) {
+ // Widen the vector in input in the case of MVT::v2i32.
+ // Example: from MVT::v2i32 to MVT::v4i32.
+ MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
+ SrcVT.getVectorNumElements() * 2);
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
+ DAG.getUNDEF(SrcVT));
+ } else {
+ assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
+ "Unexpected source type in LowerBITCAST");
+ Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
+ }
- if (DstVT == MVT::x86mmx)
- return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
+ MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
+ Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
- DAG.getIntPtrConstant(0, dl));
- }
+ if (DstVT == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
- assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
- Subtarget.hasMMX() && "Unexpected custom BITCAST");
- assert((DstVT == MVT::i64 ||
- (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
- "Unexpected custom BITCAST");
- // i64 <=> MMX conversions are Legal.
- if (SrcVT==MVT::i64 && DstVT.isVector())
- return Op;
- if (DstVT==MVT::i64 && SrcVT.isVector())
- return Op;
- // MMX <=> MMX conversions are Legal.
- if (SrcVT.isVector() && DstVT.isVector())
- return Op;
- // All other conversions need to be expanded.
- return SDValue();
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
+ DAG.getIntPtrConstant(0, dl));
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
@@ -26549,6 +27054,13 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
+ // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
+ // lowering.
+ if (VT == MVT::v8i64 || VT == MVT::v16i32) {
+ assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
+ return Lower512IntUnary(Op, DAG);
+ }
+
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
"Only byte vector BITREVERSE supported");
@@ -26656,12 +27168,12 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
// seq_cst which isn't SingleThread, everything just needs to be preserved
// during codegen and then dropped. Note that we expect (but don't assume),
// that orderings other than seq_cst and acq_rel have been canonicalized to
- // a store or load.
+ // a store or load.
if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
AN->getSyncScopeID() == SyncScope::System) {
// Prefer a locked operation against a stack location to minimize cache
// traffic. This assumes that stack locations are very likely to be
- // accessed only by the owning thread.
+ // accessed only by the owning thread.
SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
@@ -26886,12 +27398,13 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
- if (VT == MVT::v2f32) {
+ if (VT == MVT::v2f32 || VT == MVT::v2i32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
- DAG.getUNDEF(MVT::v2f32));
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
@@ -26901,30 +27414,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
- if (VT == MVT::v2i32) {
- assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
- DAG.getUNDEF(MVT::v2i32));
- // If the index is v2i64 and we have VLX we can use xmm for data and index.
- if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
- SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
- SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- return SDValue(NewScatter.getNode(), 1);
- }
- // Custom widen all the operands to avoid promotion.
- EVT NewIndexVT = EVT::getVectorVT(
- *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
- Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
- DAG.getUNDEF(Index.getValueType()));
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
- DAG.getConstant(0, dl, MVT::v2i1));
- SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
- Ops, N->getMemOperand());
- }
-
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
@@ -27160,6 +27649,13 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
return NOOP;
}
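+/// Lower an f128 operation by forwarding its operands to the given RTLIB
+/// libcall.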
+SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
+ RTLIB::Libcall Call) const {
+ SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
+}
+
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -27206,10 +27702,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
+ case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
- case ISD::FSUB: return lowerFaddFsub(Op, DAG, Subtarget);
+ case ISD::FSUB: return lowerFaddFsub(Op, DAG);
+ case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+ case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
@@ -27347,37 +27847,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::MUL: {
EVT VT = N->getValueType(0);
- assert(VT.isVector() && "Unexpected VT");
- if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&
- VT.getVectorNumElements() == 2) {
- // Promote to a pattern that will be turned into PMULUDQ.
- SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
- N->getOperand(0));
- SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
- N->getOperand(1));
- SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1);
- Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
- } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- VT.getVectorElementType() == MVT::i8) {
- // Pre-promote these to vXi16 to avoid op legalization thinking all 16
- // elements are needed.
- MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
- SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
- SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
- SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
- Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
- unsigned NumConcats = 16 / VT.getVectorNumElements();
- SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
- ConcatOps[0] = Res;
- Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
- Results.push_back(Res);
- }
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
+ // Pre-promote these to vXi16 to avoid op legalization thinking all 16
+ // elements are needed.
+ MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+ SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
+ SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
+ SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ unsigned NumConcats = 16 / VT.getVectorNumElements();
+ SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
+ ConcatOps[0] = Res;
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
+ Results.push_back(Res);
return;
}
- case ISD::UADDSAT:
- case ISD::SADDSAT:
- case ISD::USUBSAT:
- case ISD::SSUBSAT:
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
// Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
@@ -27388,6 +27873,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT InVT = N->getOperand(0).getValueType();
assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
"Expected a VT that divides into 128 bits.");
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
unsigned NumConcat = 128 / InVT.getSizeInBits();
EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
@@ -27404,9 +27891,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
- DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
@@ -27435,26 +27919,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Hi);
return;
}
- case ISD::SETCC: {
- // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
- // setCC result type is v2i1 because type legalzation will end up with
- // a v4i1 setcc plus an extend.
- assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
- if (N->getOperand(0).getValueType() != MVT::v2f32 ||
- getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
- return;
- SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
- SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
- N->getOperand(0), UNDEF);
- SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
- N->getOperand(1), UNDEF);
- SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
- N->getOperand(2));
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
- return;
- }
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
@@ -27475,7 +27939,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SREM:
case ISD::UREM: {
EVT VT = N->getValueType(0);
- if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) {
+ if (VT.isVector()) {
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
// If this RHS is a constant splat vector we can widen this and let
// division/remainder by constant optimize it.
// TODO: Can we do something for non-splat?
@@ -27493,17 +27959,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- if (VT == MVT::v2i32) {
- // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the
- // v2i64 and unroll later. But then we create i64 scalar ops which
- // might be slow in 64-bit mode or require a libcall in 32-bit mode.
- Results.push_back(DAG.UnrollVectorOp(N));
- return;
- }
-
- if (VT.isVector())
- return;
-
LLVM_FALLTHROUGH;
}
case ISD::SDIVREM:
@@ -27561,58 +28016,40 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
}
- return;
- }
- case ISD::SIGN_EXTEND_VECTOR_INREG: {
- if (ExperimentalVectorWideningLegalization)
- return;
-
- EVT VT = N->getValueType(0);
- SDValue In = N->getOperand(0);
- EVT InVT = In.getValueType();
- if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
- (InVT == MVT::v16i16 || InVT == MVT::v32i8)) {
- // Custom split this so we can extend i8/i16->i32 invec. This is better
- // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
- // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
- // we allow the sra from the extend to i32 to be shared by the split.
- EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
- InVT.getVectorElementType(),
- InVT.getVectorNumElements() / 2);
- MVT ExtendVT = MVT::getVectorVT(MVT::i32,
- VT.getVectorNumElements());
- In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT,
- In, DAG.getIntPtrConstant(0, dl));
- In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In);
-
- // Fill a vector with sign bits for each element.
- SDValue Zero = DAG.getConstant(0, dl, ExtendVT);
- SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
-
- // Create an unpackl and unpackh to interleave the sign bits then bitcast
- // to vXi64.
- SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits);
- Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
- SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits);
- Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+ if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
+ getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
+ isTypeLegal(MVT::v4i64)) {
+ // Input needs to be split and output needs to widened. Let's use two
+ // VTRUNCs, and shuffle their results together into the wider type.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
+ Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
+ SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
+ { 0, 1, 2, 3, 16, 17, 18, 19,
+ -1, -1, -1, -1, -1, -1, -1, -1 });
Results.push_back(Res);
return;
}
+
return;
}
+ case ISD::ANY_EXTEND:
+ // Right now, only MVT::v8i8 has Custom action for an illegal type.
+ // It's intended to custom handle the input type.
+ assert(N->getValueType(0) == MVT::v8i8 &&
+ "Do not know how to legalize this Node");
+ return;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
- (InVT == MVT::v4i16 || InVT == MVT::v4i8) &&
- getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) {
+        (InVT == MVT::v4i16 || InVT == MVT::v4i8)) {
+ assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
+ "Unexpected type action!");
assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
@@ -27683,27 +28120,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
- // Promote these manually to avoid over promotion to v2i64. Type
- // legalization will revisit the v2i32 operation for more cleanup.
- if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
- getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
- // AVX512DQ provides instructions that produce a v2i64 result.
- if (Subtarget.hasDQI())
- return;
-
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src);
- Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
- : ISD::AssertSext,
- dl, MVT::v2i32, Res,
- DAG.getValueType(VT.getVectorElementType()));
- Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
- Results.push_back(Res);
- return;
- }
-
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
- return;
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
// Try to create a 128 bit vector, but don't exceed a 32 bit element.
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
@@ -27738,35 +28157,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
- bool Widenv2i32 =
- getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector;
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
if (Src.getValueType() == MVT::v2f64) {
- unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
if (!IsSigned && !Subtarget.hasVLX()) {
- // If v2i32 is widened, we can defer to the generic legalizer.
- if (Widenv2i32)
- return;
- // Custom widen by doubling to a legal vector with. Isel will
- // further widen to v8f64.
- Opc = ISD::FP_TO_UINT;
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64,
- Src, DAG.getUNDEF(MVT::v2f64));
+ // If we have VLX we can emit a target specific FP_TO_UINT node,
+ // otherwise we can defer to the generic legalizer which will widen
+ // the input as well. This will be further widened during op
+ // legalization to v8i32<-v8f64.
+ return;
}
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
- if (!Widenv2i32)
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
- return;
- }
- if (SrcVT == MVT::v2f32 &&
- getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
- SDValue Idx = DAG.getIntPtrConstant(0, dl);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
- DAG.getUNDEF(MVT::v2f32));
- Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
- : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
}
@@ -27776,6 +28178,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
+ assert(!VT.isVector() && "Vectors should have been handled above!");
+
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
@@ -27847,7 +28251,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
@@ -27905,7 +28309,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- unsigned BasePtr = TRI->getBaseRegister();
+ Register BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
@@ -28060,34 +28464,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- if (SrcVT != MVT::f64 ||
- (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
- getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
+ if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
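+      // Use MOVQ2DQ to move the MMX value into the low 64 bits of the
+      // widened vector result.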
+ assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
+ "Unexpected type action!");
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
+ SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
+ Results.push_back(Res);
return;
+ }
- unsigned NumElts = DstVT.getVectorNumElements();
- EVT SVT = DstVT.getVectorElementType();
- EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
- SDValue Res;
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0));
- Res = DAG.getBitcast(WiderVT, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
return;
}
case ISD::MGATHER: {
EVT VT = N->getValueType(0);
- if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+ if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
+ (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
if (Index.getValueType() != MVT::v2i64)
return;
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
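+      // Widen the v2 result and pass-thru to the v4 type produced by the
+      // gather instruction; only the low two elements are used.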
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
- SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
Gather->getPassThru(),
- DAG.getUNDEF(MVT::v2f32));
+ DAG.getUNDEF(VT));
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
@@ -28098,66 +28501,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
+ DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(2));
return;
}
- if (VT == MVT::v2i32) {
- auto *Gather = cast<MaskedGatherSDNode>(N);
- SDValue Index = Gather->getIndex();
- SDValue Mask = Gather->getMask();
- assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
- SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
- Gather->getPassThru(),
- DAG.getUNDEF(MVT::v2i32));
- // If the index is v2i64 we can use it directly.
- if (Index.getValueType() == MVT::v2i64 &&
- (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
- if (!Subtarget.hasVLX()) {
- // We need to widen the mask, but the instruction will only use 2
- // of its elements. So we can use undef.
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
- DAG.getUNDEF(MVT::v2i1));
- Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
- }
- SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
- Gather->getBasePtr(), Index, Gather->getScale() };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
- Gather->getMemoryVT(), Gather->getMemOperand());
- SDValue Chain = Res.getValue(2);
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
- Results.push_back(Chain);
- return;
- }
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
- EVT IndexVT = Index.getValueType();
- EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
- IndexVT.getScalarType(), 4);
- // Otherwise we need to custom widen everything to avoid promotion.
- Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
- DAG.getUNDEF(IndexVT));
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
- DAG.getConstant(0, dl, MVT::v2i1));
- SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
- Gather->getBasePtr(), Index, Gather->getScale() };
- SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
- Gather->getMemoryVT(), dl, Ops,
- Gather->getMemOperand());
- SDValue Chain = Res.getValue(1);
- if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
- Results.push_back(Chain);
- return;
- }
- }
return;
}
case ISD::LOAD: {
@@ -28166,8 +28515,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
// cast since type legalization will try to use an i64 load.
MVT VT = N->getSimpleValueType(0);
assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
- return;
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
@@ -28177,11 +28526,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
- MVT WideVT = MVT::getVectorVT(LdVT, 2);
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
- MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() * 2);
- Res = DAG.getBitcast(CastVT, Res);
+ MVT VecVT = MVT::getVectorVT(LdVT, 2);
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ Res = DAG.getBitcast(WideVT, Res);
Results.push_back(Res);
Results.push_back(Chain);
return;
@@ -28236,6 +28584,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
case X86ISD::Wrapper: return "X86ISD::Wrapper";
case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
+ case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ";
case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
@@ -28373,6 +28722,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
@@ -28737,6 +29087,9 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
}
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+ if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
+ return false;
+
EVT SrcVT = ExtVal.getOperand(0).getValueType();
// There is no extending load for vXi1.
@@ -28856,10 +29209,10 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
- unsigned mainDstReg = MRI.createVirtualRegister(RC);
- unsigned fallDstReg = MRI.createVirtualRegister(RC);
+ Register mainDstReg = MRI.createVirtualRegister(RC);
+ Register fallDstReg = MRI.createVirtualRegister(RC);
// thisMBB:
// xbegin fallMBB
@@ -28913,7 +29266,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
MachineOperand &Scale = MI.getOperand(2);
MachineOperand &Index = MI.getOperand(3);
@@ -29049,7 +29402,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
assert(OffsetReg != 0);
// Read the reg_save_area address.
- unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+ Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
.add(Base)
.add(Scale)
@@ -29059,8 +29412,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.setMemRefs(LoadOnlyMMO);
// Zero-extend the offset
- unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+ Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
.addImm(0)
.addReg(OffsetReg)
.addImm(X86::sub_32bit);
@@ -29071,7 +29424,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.addReg(RegSaveReg);
// Compute the offset for the next argument
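// (Each GPR slot in the reg_save_area is 8 bytes and each XMM slot is 16
// bytes, hence the 8/16 increment below.)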
- unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
.addReg(OffsetReg)
.addImm(UseFPOffset ? 16 : 8);
@@ -29096,7 +29449,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
//
// Load the overflow_area address into a register.
- unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
.add(Base)
.add(Scale)
@@ -29110,7 +29463,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
if (NeedsAlign) {
// Align the overflow address
assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
- unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+ Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
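// e.g. with align = 16: addr 0x1003 -> (0x1003 + 15) & ~15 = 0x1010.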
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
@@ -29127,7 +29480,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
- unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
.addReg(OverflowDestReg)
.addImm(ArgSizeA8);
@@ -29191,7 +29544,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned CountReg = MI.getOperand(0).getReg();
+ Register CountReg = MI.getOperand(0).getReg();
int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
@@ -29273,7 +29626,9 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
static bool isCMOVPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
case X86::CMOV_FR32:
+ case X86::CMOV_FR32X:
case X86::CMOV_FR64:
+ case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
@@ -29326,9 +29681,9 @@ static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
MachineInstrBuilder MIB;
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
- unsigned DestReg = MIIt->getOperand(0).getReg();
- unsigned Op1Reg = MIIt->getOperand(1).getReg();
- unsigned Op2Reg = MIIt->getOperand(2).getReg();
+ Register DestReg = MIIt->getOperand(0).getReg();
+ Register Op1Reg = MIIt->getOperand(1).getReg();
+ Register Op2Reg = MIIt->getOperand(2).getReg();
// If the CMOV we are generating uses the opposite condition from
// the jump we generated, then we have to swap the operands for the
@@ -29486,9 +29841,9 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
- unsigned DestReg = FirstCMOV.getOperand(0).getReg();
- unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
- unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
+ Register DestReg = FirstCMOV.getOperand(0).getReg();
+ Register Op1Reg = FirstCMOV.getOperand(1).getReg();
+ Register Op2Reg = FirstCMOV.getOperand(2).getReg();
MachineInstrBuilder MIB =
BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
.addReg(Op1Reg)
@@ -30006,7 +30361,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
// call the retpoline thunk.
DebugLoc DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
- unsigned CalleeVReg = MI.getOperand(0).getReg();
+ Register CalleeVReg = MI.getOperand(0).getReg();
unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
// Find an available scratch register to hold the callee. On 64-bit, we can
@@ -30079,7 +30434,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
// Initialize a register with zero.
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
- unsigned ZReg = MRI.createVirtualRegister(PtrRC);
+ Register ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
.addDef(ZReg)
@@ -30087,7 +30442,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
.addReg(ZReg, RegState::Undef);
// Read the current SSP Register value to the zeroed register.
- unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
@@ -30131,8 +30486,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
(void)TRI;
- unsigned mainDstReg = MRI.createVirtualRegister(RC);
- unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+ Register mainDstReg = MRI.createVirtualRegister(RC);
+ Register restoreDstReg = MRI.createVirtualRegister(RC);
MemOpndSlot = CurOp;
@@ -30246,8 +30601,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
- unsigned FramePtr = RegInfo->getFrameRegister(*MF);
- unsigned BasePtr = RegInfo->getBaseRegister();
+ Register FramePtr = RegInfo->getFrameRegister(*MF);
+ Register BasePtr = RegInfo->getBaseRegister();
unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
@@ -30329,7 +30684,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MBB->addSuccessor(checkSspMBB);
// Initialize a register with zero.
- unsigned ZReg = MRI.createVirtualRegister(PtrRC);
+ Register ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
.addDef(ZReg)
@@ -30337,7 +30692,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
.addReg(ZReg, RegState::Undef);
// Read the current SSP Register value to the zeroed register.
- unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
@@ -30352,7 +30707,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
checkSspMBB->addSuccessor(fallMBB);
// Reload the previously saved SSP register value.
- unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
+ Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
const int64_t SPPOffset = 3 * PVT.getStoreSize();
MachineInstrBuilder MIB =
@@ -30370,7 +30725,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MIB.setMemRefs(MMOs);
// Subtract the current SSP from the previous SSP.
- unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
+ Register SspSubReg = MRI.createVirtualRegister(PtrRC);
unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
.addReg(PrevSSPReg)
@@ -30384,7 +30739,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
// Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
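// e.g. on 64-bit, a 40-byte SSP delta becomes 40 >> 3 = 5 shadow stack slots.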
unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
- unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
+ Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
.addReg(SspSubReg)
.addImm(Offset);
@@ -30394,7 +30749,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
// Reset the lower 8 bits.
- unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
+ Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
.addReg(SspFirstShrReg)
.addImm(8);
@@ -30406,12 +30761,12 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
// Do a single shift left.
unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
- unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
+ Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
.addReg(SspSecondShrReg);
// Save the value 128 to a register (will be used next with incssp).
- unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
+ Register Value128InReg = MRI.createVirtualRegister(PtrRC);
unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
.addImm(128);
@@ -30419,8 +30774,8 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
// Since incssp only looks at the lower 8 bits, we might need to do several
// iterations of incssp until we finish fixing the shadow stack.
- unsigned DecReg = MRI.createVirtualRegister(PtrRC);
- unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
+ Register DecReg = MRI.createVirtualRegister(PtrRC);
+ Register CounterReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
.addReg(SspAfterShlReg)
.addMBB(fixShadowLoopPrepareMBB)
@@ -30460,11 +30815,11 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
- unsigned Tmp = MRI.createVirtualRegister(RC);
+ Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
- unsigned SP = RegInfo->getStackRegister();
+ Register SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
@@ -30662,8 +31017,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setRestoreBasePointer(MF);
- unsigned FP = RI.getFrameRegister(*MF);
- unsigned BP = RI.getBaseRegister();
+ Register FP = RI.getFrameRegister(*MF);
+ Register BP = RI.getBaseRegister();
unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
MFI->getRestoreBasePointerOffset())
@@ -30674,7 +31029,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
}
// IReg is used as an index in a memory operand and therefore can't be SP.
- unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
+ Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
Subtarget.is64Bit() ? 8 : 4);
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
@@ -30683,8 +31038,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
if (Subtarget.is64Bit()) {
- unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
- unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+ Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
// leaq .LJTI0_0(%rip), BReg
BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
@@ -30710,9 +31065,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(0);
break;
case MachineJumpTableInfo::EK_LabelDifference32: {
- unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
- unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
- unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
// movl (BReg,IReg64,4), OReg
BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
@@ -30783,8 +31138,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
DefRegs[MOp.getReg()] = true;
MachineInstrBuilder MIB(*MF, &II);
- for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
- unsigned Reg = SavedRegs[RI];
+ for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
+ unsigned Reg = SavedRegs[RegIdx];
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
}
@@ -30906,20 +31261,18 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
// Load the old value of the control word...
- unsigned OldCW =
- MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
OrigCWFrameIdx);
// OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
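// (0xC00 == 0b1100'0000'0000, i.e. exactly bits 10 and 11 set.)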
- unsigned NewCW =
- MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
.addReg(OldCW, RegState::Kill).addImm(0xC00);
// Extract to 16 bits.
- unsigned NewCW16 =
- MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+ Register NewCW16 =
+ MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
@@ -31023,7 +31376,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT SPTy = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
- unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
+ Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
X86AddressMode AM = getAddressFromInstr(&MI, 0);
// Regalloc does not need any help when the memory operand of CMPXCHG8B
@@ -31034,10 +31387,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
// four operand definitions that are E[ABCD] registers. We skip them and
// then insert the LEA.
- MachineBasicBlock::iterator MBBI(MI);
- while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
- MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
- --MBBI;
+ MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
+ while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
+ RMBBI->definesRegister(X86::EBX) ||
+ RMBBI->definesRegister(X86::ECX) ||
+ RMBBI->definesRegister(X86::EDX))) {
+ ++RMBBI;
+ }
+ MachineBasicBlock::iterator MBBI(RMBBI);
addFullAddress(
BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
@@ -31232,12 +31589,21 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.One |= Known2.One;
break;
}
+ case X86ISD::PSADBW: {
+ assert(VT.getScalarType() == MVT::i64 &&
+ Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
+ "Unexpected PSADBW types");
+
+ // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
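+ // (Each i64 result is a sum of eight absolute byte differences, at most
+ // 8 * 255 = 2040, which fits in 16 bits.)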
+ Known.Zero.setBitsFrom(16);
+ break;
+ }
case X86ISD::CMOV: {
- Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
+ Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
- KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
+ KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
@@ -31650,8 +32016,8 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
- ArrayRef<int> LoMask(Mask.data() + 0, 4);
- ArrayRef<int> HiMask(Mask.data() + 4, 4);
+ ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
+ ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
// PSHUFLW: permute lower 4 elements only.
if (isUndefOrInRange(LoMask, 0, 4) &&
@@ -31789,8 +32155,8 @@ static bool matchBinaryPermuteShuffle(
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
- if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
- BlendMask)) {
+ if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
+ ForceV2Zero, BlendMask)) {
if (MaskVT == MVT::v16i16) {
// We can only use v16i16 PBLENDW if the lanes are repeated.
SmallVector<int, 8> RepeatedMask;
@@ -31819,15 +32185,15 @@ static bool matchBinaryPermuteShuffle(
}
}
- // Attempt to combine to INSERTPS.
+ // Attempt to combine to INSERTPS, but only if it has elements that need to
+ // be set to zero.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
- MaskVT.is128BitVector()) {
- if (Zeroable.getBoolValue() &&
- matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
- Shuffle = X86ISD::INSERTPS;
- ShuffleVT = MVT::v4f32;
- return true;
- }
+ MaskVT.is128BitVector() &&
+ llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
+ matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ Shuffle = X86ISD::INSERTPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
}
// Attempt to combine to SHUFPD.
@@ -31835,7 +32201,11 @@ static bool matchBinaryPermuteShuffle(
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
- if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
+ PermuteImm, Mask, Zeroable)) {
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
@@ -31889,6 +32259,15 @@ static bool matchBinaryPermuteShuffle(
}
}
+ // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
+ if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ MaskVT.is128BitVector() &&
+ matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ Shuffle = X86ISD::INSERTPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+
return false;
}
@@ -31942,7 +32321,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
- (RootVT.isFloatingPoint() && Depth >= 2) ||
+ (RootVT.isFloatingPoint() && Depth >= 1) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are a AVX512/EVEX target and the mask element size
@@ -31981,7 +32360,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
unsigned PermMask = 0;
@@ -31991,7 +32370,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
- DAG.getConstant(PermMask, DL, MVT::i8));
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
@@ -32026,8 +32405,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
// TODO: Should we indicate which domain is preferred if both are allowed?
- bool AllowFloatDomain = FloatDomain || (Depth > 3);
- bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
+ bool AllowFloatDomain = FloatDomain || (Depth >= 3);
+ bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
@@ -32062,14 +32441,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
if (Subtarget.hasAVX2()) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
@@ -32083,7 +32462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 1 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
@@ -32094,11 +32473,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 1 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
- DAG.getConstant(PermuteImm, DL, MVT::i8));
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
@@ -32109,7 +32488,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 1 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
@@ -32123,12 +32502,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 1 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
- DAG.getConstant(PermuteImm, DL, MVT::i8));
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
@@ -32141,34 +32520,34 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
V2 = DAG.getBitcast(IntMaskVT, V2);
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
- if (Depth < 2)
+ if (Depth < 1)
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
- int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
+ int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
bool MaskContainsZeros =
@@ -32321,7 +32700,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
V2 = DAG.getBitcast(MaskVT, V2);
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
- DAG.getConstant(M2ZImm, DL, MVT::i8));
+ DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
@@ -32650,7 +33029,7 @@ static SDValue combineX86ShufflesRecursively(
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
- if (Depth > MaxRecursionDepth)
+ if (Depth >= MaxRecursionDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
@@ -32667,11 +33046,18 @@ static SDValue combineX86ShufflesRecursively(
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
+ // TODO - determine Op's demanded elts from RootMask.
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
- if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
+ APInt OpUndef, OpZero;
+ APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
+ if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
+ OpZero, DAG, Depth, false))
return SDValue();
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
+
// Add the inputs to the Ops list, avoiding duplicates.
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
@@ -32772,6 +33158,9 @@ static SDValue combineX86ShufflesRecursively(
Mask[i] = OpMaskedIdx;
}
+ // Remove unused/repeated shuffle source ops.
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+
// Handle the all undef/zero cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
@@ -32783,11 +33172,8 @@ static SDValue combineX86ShufflesRecursively(
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
- // Remove unused/repeated shuffle source ops.
- resolveTargetShuffleInputsAndMask(Ops, Mask);
assert(!Ops.empty() && "Shuffle with no inputs detected");
-
- HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
+ HasVariableMask |= IsOpVariableMask;
// Update the list of shuffle nodes that have been combined so far.
SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
@@ -32853,7 +33239,7 @@ static SDValue combineX86ShufflesRecursively(
/// Helper entry wrapper to combineX86ShufflesRecursively.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
/*HasVarMask*/ false,
/*AllowVarMask*/ true, DAG, Subtarget);
}
@@ -33088,7 +33474,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
for (unsigned i = 0; i != Scale; ++i)
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
- {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
+ {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
@@ -33120,6 +33506,30 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
VT.getSizeInBits());
}
+ // vbroadcast(scalarload X) -> vbroadcast_load X
+ // For float loads, extract other uses of the scalar from the broadcast.
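+ // (Any remaining scalar users are redirected to an extract of lane 0 of
+ // the broadcasted value instead of the original load.)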
+ if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceExtract = Src.hasOneUse();
+ DCI.CombineTo(N.getNode(), BcastLd);
+ if (NoReplaceExtract) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ } else {
+ SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
+ DAG.getIntPtrConstant(0, DL));
+ DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
+ }
+ return N; // Return N so it doesn't get rechecked!
+ }
+
return SDValue();
}
case X86ISD::BLENDI: {
@@ -33133,14 +33543,14 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
MVT SrcVT = N0.getOperand(0).getSimpleValueType();
if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
SrcVT.getScalarSizeInBits() >= 32) {
- unsigned Mask = N.getConstantOperandVal(2);
+ unsigned BlendMask = N.getConstantOperandVal(2);
unsigned Size = VT.getVectorNumElements();
unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
- unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
N1.getOperand(0),
- DAG.getConstant(ScaleMask, DL, MVT::i8)));
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
}
}
return SDValue();
@@ -33208,76 +33618,97 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// If we zero out all elements from Op0 then we don't need to reference it.
if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// If we zero out the element from Op1 then we don't need to reference it.
if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// Attempt to merge insertps Op1 with an inner target shuffle node.
SmallVector<int, 8> TargetMask1;
SmallVector<SDValue, 2> Ops1;
- if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
- int M = TargetMask1[SrcIdx];
- if (isUndefOrZero(M)) {
+ APInt KnownUndef1, KnownZero1;
+ if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
+ KnownZero1)) {
+ if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
// Zero/UNDEF insertion - zero out element and remove dependency.
InsertPSMask |= (1u << DstIdx);
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// Update insertps mask srcidx and reference the source input directly.
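+ // (Bits [7:6] of the insertps immediate select the source lane; bits [5:0]
+ // hold the destination lane and zero mask and are preserved below.)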
+ int M = TargetMask1[SrcIdx];
assert(0 <= M && M < 8 && "Shuffle index out of range");
InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
Op1 = Ops1[M < 4 ? 0 : 1];
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// Attempt to merge insertps Op0 with an inner target shuffle node.
SmallVector<int, 8> TargetMask0;
SmallVector<SDValue, 2> Ops0;
- if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
- return SDValue();
+ APInt KnownUndef0, KnownZero0;
+ if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
+ KnownZero0)) {
+ bool Updated = false;
+ bool UseInput00 = false;
+ bool UseInput01 = false;
+ for (int i = 0; i != 4; ++i) {
+ if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+ // No change if element is already zero or the inserted element.
+ continue;
+ } else if (KnownUndef0[i] || KnownZero0[i]) {
+ // If the target mask is undef/zero then we must zero the element.
+ InsertPSMask |= (1u << i);
+ Updated = true;
+ continue;
+ }
- bool Updated = false;
- bool UseInput00 = false;
- bool UseInput01 = false;
- for (int i = 0; i != 4; ++i) {
- int M = TargetMask0[i];
- if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
- // No change if element is already zero or the inserted element.
- continue;
- } else if (isUndefOrZero(M)) {
- // If the target mask is undef/zero then we must zero the element.
- InsertPSMask |= (1u << i);
- Updated = true;
- continue;
+ // The input vector element must be inline.
+ int M = TargetMask0[i];
+ if (M != i && M != (i + 4))
+ return SDValue();
+
+ // Determine which inputs of the target shuffle we're using.
+ UseInput00 |= (0 <= M && M < 4);
+ UseInput01 |= (4 <= M);
}
- // The input vector element must be inline.
- if (M != i && M != (i + 4))
- return SDValue();
+ // If we're not using both inputs of the target shuffle then use the
+ // referenced input directly.
+ if (UseInput00 && !UseInput01) {
+ Updated = true;
+ Op0 = Ops0[0];
+ } else if (!UseInput00 && UseInput01) {
+ Updated = true;
+ Op0 = Ops0[1];
+ }
- // Determine which inputs of the target shuffle we're using.
- UseInput00 |= (0 <= M && M < 4);
- UseInput01 |= (4 <= M);
+ if (Updated)
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
- // If we're not using both inputs of the target shuffle then use the
- // referenced input directly.
- if (UseInput00 && !UseInput01) {
- Updated = true;
- Op0 = Ops0[0];
- } else if (!UseInput00 && UseInput01) {
- Updated = true;
- Op0 = Ops0[1];
+ // If we're inserting an element from a vbroadcast load, fold the
+ // load into the X86insertps instruction. We need to convert the scalar
+ // load to a vector and clear the source lane of the INSERTPS control.
+ if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
+ if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
+ SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
+ MemIntr->getBasePtr(),
+ MemIntr->getMemOperand());
+ SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
+ Load),
+ DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
+ return Insert;
+ }
}
- if (Updated)
- return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
-
return SDValue();
}
default:
@@ -33580,7 +34011,7 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
}
/// Eliminate a redundant shuffle of a horizontal math op.
-static SDValue foldShuffleOfHorizOp(SDNode *N) {
+static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
@@ -33611,17 +34042,36 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
+ // The shuffle that we are eliminating may have allowed the horizontal op to
+ // have an undemanded (undefined) operand. Duplicate the other (defined)
+ // operand to ensure that the results are defined across all lanes without the
+ // shuffle.
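+ // e.g. (hadd X, undef) and (hadd undef, X) are both rebuilt as (hadd X, X).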
+ auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
+ SDValue X;
+ if (HorizOp.getOperand(0).isUndef()) {
+ assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
+ X = HorizOp.getOperand(1);
+ } else if (HorizOp.getOperand(1).isUndef()) {
+ assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
+ X = HorizOp.getOperand(0);
+ } else {
+ return HorizOp;
+ }
+ return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
+ HorizOp.getValueType(), X, X);
+ };
+
// When the operands of a horizontal math op are identical, the low half of
// the result is the same as the high half. If a target shuffle is also
- // replicating low and high halves, we don't need the shuffle.
+ // replicating low and high halves (and without changing the type/length of
+ // the vector), we don't need the shuffle.
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
- if (HOp.getScalarValueSizeInBits() == 64) {
+ if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
// movddup (hadd X, X) --> hadd X, X
// broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
assert((HOp.getValueType() == MVT::v2f64 ||
- HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT &&
- "Unexpected type for h-op");
- return HOp;
+ HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
+ return updateHOp(HOp, DAG);
}
return SDValue();
}
@@ -33635,14 +34085,14 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
(isTargetShuffleEquivalent(Mask, {0, 0}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
- return HOp;
+ return updateHOp(HOp, DAG);
if (HOp.getValueSizeInBits() == 256 &&
(isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
isTargetShuffleEquivalent(
Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
- return HOp;
+ return updateHOp(HOp, DAG);
return SDValue();
}
@@ -33677,7 +34127,7 @@ static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
// the wide shuffle that we started with.
return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
Shuf->getOperand(1), HalfMask, HalfIdx1,
- HalfIdx2, false, DAG);
+ HalfIdx2, false, DAG, /*UseConcat*/true);
}
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
@@ -33696,70 +34146,10 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
- if (SDValue HAddSub = foldShuffleOfHorizOp(N))
+ if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
return HAddSub;
}
- // During Type Legalization, when promoting illegal vector types,
- // the backend might introduce new shuffle dag nodes and bitcasts.
- //
- // This code performs the following transformation:
- // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
- // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
- //
- // We do this only if both the bitcast and the BINOP dag nodes have
- // one use. Also, perform this transformation only if the new binary
- // operation is legal. This is to avoid introducing dag nodes that
- // potentially need to be further expanded (or custom lowered) into a
- // less optimal sequence of dag nodes.
- if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
- N->getOpcode() == ISD::VECTOR_SHUFFLE &&
- N->getOperand(0).getOpcode() == ISD::BITCAST &&
- N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- SDValue BC0 = N0.getOperand(0);
- EVT SVT = BC0.getValueType();
- unsigned Opcode = BC0.getOpcode();
- unsigned NumElts = VT.getVectorNumElements();
-
- if (BC0.hasOneUse() && SVT.isVector() &&
- SVT.getVectorNumElements() * 2 == NumElts &&
- TLI.isOperationLegal(Opcode, VT)) {
- bool CanFold = false;
- switch (Opcode) {
- default : break;
- case ISD::ADD:
- case ISD::SUB:
- case ISD::MUL:
- // isOperationLegal lies for integer ops on floating point types.
- CanFold = VT.isInteger();
- break;
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
- // isOperationLegal lies for floating point ops on integer types.
- CanFold = VT.isFloatingPoint();
- break;
- }
-
- unsigned SVTNumElts = SVT.getVectorNumElements();
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
- CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
- for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
- CanFold = SVOp->getMaskElt(i) < 0;
-
- if (CanFold) {
- SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
- SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
- SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
- return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
- }
- }
- }
-
// Attempt to combine into a vector load/broadcast.
if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
return LD;
@@ -33841,7 +34231,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
ISD::isNormalLoad(N->getOperand(0).getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- if (!LN->isVolatile()) {
+ if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue VZLoad =
@@ -33855,53 +34245,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
}
}
-
- // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
- // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
- // FIXME: This can probably go away once we default to widening legalization.
- if (Subtarget.hasSSE41() && VT == MVT::v4i32 &&
- N->getOpcode() == ISD::VECTOR_SHUFFLE &&
- N->getOperand(0).getOpcode() == ISD::BITCAST &&
- N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) {
- SDValue BC = N->getOperand(0);
- SDValue MULUDQ = BC.getOperand(0);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- ArrayRef<int> Mask = SVOp->getMask();
- if (BC.hasOneUse() && MULUDQ.hasOneUse() &&
- Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) {
- SDValue Op0 = MULUDQ.getOperand(0);
- SDValue Op1 = MULUDQ.getOperand(1);
- if (Op0.getOpcode() == ISD::BITCAST &&
- Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
- Op0.getOperand(0).getValueType() == MVT::v4i32) {
- ShuffleVectorSDNode *SVOp0 =
- cast<ShuffleVectorSDNode>(Op0.getOperand(0));
- ArrayRef<int> Mask2 = SVOp0->getMask();
- if (Mask2[0] == 0 && Mask2[1] == -1 &&
- Mask2[2] == 1 && Mask2[3] == -1) {
- Op0 = SVOp0->getOperand(0);
- Op1 = DAG.getBitcast(MVT::v4i32, Op1);
- Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask);
- return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
- }
- }
- if (Op1.getOpcode() == ISD::BITCAST &&
- Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
- Op1.getOperand(0).getValueType() == MVT::v4i32) {
- ShuffleVectorSDNode *SVOp1 =
- cast<ShuffleVectorSDNode>(Op1.getOperand(0));
- ArrayRef<int> Mask2 = SVOp1->getMask();
- if (Mask2[0] == 0 && Mask2[1] == -1 &&
- Mask2[2] == 1 && Mask2[3] == -1) {
- Op0 = DAG.getBitcast(MVT::v4i32, Op0);
- Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask);
- Op1 = SVOp1->getOperand(0);
- return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
- }
- }
- }
- }
-
return SDValue();
}
@@ -33966,6 +34309,84 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// TODO convert SrcUndef to KnownUndef.
break;
}
+ case X86ISD::KSHIFTL: {
+ SDValue Src = Op.getOperand(0);
+ auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
+ assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
+ unsigned ShiftAmt = Amt->getZExtValue();
+
+ if (ShiftAmt == 0)
+ return TLO.CombineTo(Op, Src);
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
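+ // e.g. kshiftl (kshiftr X, 2), 5 --> kshiftl X, 3 when the low 5 elements
+ // of the result are not demanded.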
+ if (Src.getOpcode() == X86ISD::KSHIFTR) {
+ if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
+ unsigned C1 = Src.getConstantOperandVal(1);
+ unsigned NewOpc = X86ISD::KSHIFTL;
+ int Diff = ShiftAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ NewOpc = X86ISD::KSHIFTR;
+ }
+
+ SDLoc dl(Op);
+ SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
+ }
+ }
+
+ APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
+ Depth + 1))
+ return true;
+
+ KnownUndef <<= ShiftAmt;
+ KnownZero <<= ShiftAmt;
+ KnownZero.setLowBits(ShiftAmt);
+ break;
+ }
+ case X86ISD::KSHIFTR: {
+ SDValue Src = Op.getOperand(0);
+ auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
+ assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
+ unsigned ShiftAmt = Amt->getZExtValue();
+
+ if (ShiftAmt == 0)
+ return TLO.CombineTo(Op, Src);
+
+ // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the top bits (which are shifted
+ // out) are never demanded.
+ if (Src.getOpcode() == X86ISD::KSHIFTL) {
+ if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
+ unsigned C1 = Src.getConstantOperandVal(1);
+ unsigned NewOpc = X86ISD::KSHIFTR;
+ int Diff = ShiftAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ NewOpc = X86ISD::KSHIFTL;
+ }
+
+ SDLoc dl(Op);
+ SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
+ }
+ }
+
+ APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
+ Depth + 1))
+ return true;
+
+ KnownUndef.lshrInPlace(ShiftAmt);
+ KnownZero.lshrInPlace(ShiftAmt);
+ KnownZero.setHighBits(ShiftAmt);
+ break;
+ }
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: {
SDValue Src = Op.getOperand(0);
@@ -33979,16 +34400,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt SrcUndef, SrcZero;
- if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
- SrcZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
+ Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef,
- SrcZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
+ Depth + 1))
return true;
+
+ // Aggressively peek through ops to get at the demanded elts.
+ // TODO - we should do this for all target/faux shuffle ops.
+ if (!DemandedElts.isAllOnesValue()) {
+ APInt DemandedSrcBits =
+ APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
+ SDValue NewN0 = SimplifyMultipleUseDemandedBits(
+ N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1);
+ SDValue NewN1 = SimplifyMultipleUseDemandedBits(
+ N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1);
+ if (NewN0 || NewN1) {
+ NewN0 = NewN0 ? NewN0 : N0;
+ NewN1 = NewN1 ? NewN1 : N1;
+ return TLO.CombineTo(Op,
+ TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
+ }
+ }
break;
}
case X86ISD::HADD:
@@ -34062,25 +34503,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
- case X86ISD::SUBV_BROADCAST: {
- // Reduce size of broadcast if we don't need the upper half.
- unsigned HalfElts = NumElts / 2;
- if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) {
- SDValue Src = Op.getOperand(0);
- MVT SrcVT = Src.getSimpleValueType();
-
- SDValue Half = Src;
- if (SrcVT.getVectorNumElements() != HalfElts) {
- MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts);
- Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src);
- }
-
- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0,
- TLO.DAG, SDLoc(Op),
- Half.getValueSizeInBits()));
- }
- break;
- }
case X86ISD::VPERMV: {
SDValue Mask = Op.getOperand(0);
APInt MaskUndef, MaskZero;
@@ -34135,6 +34557,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
+ // Subvector broadcast.
+ case X86ISD::SUBV_BROADCAST: {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueSizeInBits() > ExtSizeInBits)
+ Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
+ else if (Src.getValueSizeInBits() < ExtSizeInBits) {
+ MVT SrcSVT = Src.getSimpleValueType().getScalarType();
+ MVT SrcVT =
+ MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
+ Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
+ }
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -34201,36 +34638,30 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
}
- // Simplify target shuffles.
- if (!isTargetShuffle(Opc) || !VT.isSimple())
- return false;
-
- // Get target shuffle mask.
- bool IsUnary;
+ // Get target/faux shuffle mask.
+ APInt OpUndef, OpZero;
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
- if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs,
- OpMask, IsUnary))
+ if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
+ OpZero, TLO.DAG, Depth, false))
return false;
- // Shuffle inputs must be the same type as the result.
- if (llvm::any_of(OpInputs,
- [VT](SDValue V) { return VT != V.getValueType(); }))
+ // Shuffle inputs must be the same size as the result.
+ if (OpMask.size() != (unsigned)NumElts ||
+ llvm::any_of(OpInputs, [VT](SDValue V) {
+ return VT.getSizeInBits() != V.getValueSizeInBits() ||
+ !V.getValueType().isVector();
+ }))
return false;
- // Clear known elts that might have been set above.
- KnownZero.clearAllBits();
- KnownUndef.clearAllBits();
+ KnownZero = OpZero;
+ KnownUndef = OpUndef;
// Check if shuffle mask can be simplified to undef/zero/identity.
int NumSrcs = OpInputs.size();
- for (int i = 0; i != NumElts; ++i) {
- int &M = OpMask[i];
+ for (int i = 0; i != NumElts; ++i)
if (!DemandedElts[i])
- M = SM_SentinelUndef;
- else if (0 <= M && OpInputs[M / NumElts].isUndef())
- M = SM_SentinelUndef;
- }
+ OpMask[i] = SM_SentinelUndef;
if (isUndefInRange(OpMask, 0, NumElts)) {
KnownUndef.setAllBits();
@@ -34243,10 +34674,14 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
for (int Src = 0; Src != NumSrcs; ++Src)
if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
- return TLO.CombineTo(Op, OpInputs[Src]);
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
// Attempt to simplify inputs.
for (int Src = 0; Src != NumSrcs; ++Src) {
+ // TODO: Support inputs of different types.
+ if (OpInputs[Src].getValueType() != VT)
+ continue;
+
int Lo = Src * NumElts;
APInt SrcElts = APInt::getNullValue(NumElts);
for (int i = 0; i != NumElts; ++i)
@@ -34256,21 +34691,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SrcElts.setBit(M);
}
+ // TODO - Propagate input undef/zero elts.
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
}
- // Extract known zero/undef elements.
- // TODO - Propagate input undef/zero elts.
- for (int i = 0; i != NumElts; ++i) {
- if (OpMask[i] == SM_SentinelUndef)
- KnownUndef.setBit(i);
- if (OpMask[i] == SM_SentinelZero)
- KnownZero.setBit(i);
- }
-
return false;
}
@@ -34296,6 +34723,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
+
+ // Aggressively peek through ops to get at the demanded low bits.
+ SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
+ LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
+ RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedLHS || DemandedRHS) {
+ DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
+ DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
+ }
break;
}
case X86ISD::VSHLI: {
@@ -34323,7 +34762,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
SDValue NewShift = TLO.DAG.getNode(
NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
- TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
+ TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
return TLO.CombineTo(Op, NewShift);
}
}
@@ -34441,6 +34880,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
KnownVec, TLO, Depth + 1))
return true;
+ if (SDValue V = SimplifyMultipleUseDemandedBits(
+ Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
+
Known = KnownVec.zext(BitWidth, true);
return false;
}
@@ -34542,12 +34986,80 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
+SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const {
+ int NumElts = DemandedElts.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+
+ switch (Opc) {
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ // If we don't demand the inserted element, return the base vector.
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ MVT VecVT = Vec.getSimpleValueType();
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
+ !DemandedElts[CIdx->getZExtValue()])
+ return Vec;
+ break;
+ }
+ }
+
+ APInt ShuffleUndef, ShuffleZero;
+ SmallVector<int, 16> ShuffleMask;
+ SmallVector<SDValue, 2> ShuffleOps;
+ if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
+ ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
+ // If all the demanded elts are from one operand and are inline,
+ // then we can use the operand directly.
+ int NumOps = ShuffleOps.size();
+ if (ShuffleMask.size() == (unsigned)NumElts &&
+ llvm::all_of(ShuffleOps, [VT](SDValue V) {
+ return VT.getSizeInBits() == V.getValueSizeInBits();
+ })) {
+
+ if (DemandedElts.isSubsetOf(ShuffleUndef))
+ return DAG.getUNDEF(VT);
+ if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
+
+ // Bitmask that indicates which ops have only been accessed 'inline'.
+ APInt IdentityOp = APInt::getAllOnesValue(NumOps);
+ for (int i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (!DemandedElts[i] || ShuffleUndef[i])
+ continue;
+ int Op = M / NumElts;
+ int Index = M % NumElts;
+ if (M < 0 || Index != i) {
+ IdentityOp.clearAllBits();
+ break;
+ }
+ IdentityOp &= APInt::getOneBitSet(NumOps, Op);
+ if (IdentityOp == 0)
+ break;
+ }
+ assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
+ "Multiple identity shuffles detected");
+
+ if (IdentityOp != 0)
+ return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
+ }
+ }
+
+ return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
+ Op, DemandedBits, DemandedElts, DAG, Depth);
+}
+
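A minimal standalone sketch (plain C++, no LLVM types, two shuffle inputs assumed) of the identity-operand detection the new hook performs above: given a two-input shuffle mask, find the single source whose demanded lanes are all read in place.

#include <cstdint>
#include <vector>
#include <cassert>

// Returns the operand index (0 or 1) if the demanded lanes form an in-place
// identity of exactly one operand, otherwise -1. Undef lanes are -1 here.
static int findIdentityOperand(const std::vector<int> &Mask,
                               const std::vector<bool> &Demanded) {
  const int NumElts = (int)Mask.size();
  uint32_t IdentityOp = 0b11; // one bit per operand; assume 2 shuffle inputs
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (!Demanded[i] || M == -1)
      continue;                        // ignore undemanded/undef lanes
    if (M < 0 || (M % NumElts) != i)
      return -1;                       // not an in-place read
    IdentityOp &= 1u << (M / NumElts); // only this operand stays "inline"
    if (IdentityOp == 0)
      return -1;
  }
  // At most one bit can survive the loop (GCC/Clang builtin used for ctz).
  return IdentityOp ? (int)__builtin_ctz(IdentityOp) : -1;
}

int main() {
  // All lanes read operand 1 at the same index -> identity of operand 1.
  assert(findIdentityOperand({4, 5, 6, 7}, {true, true, true, true}) == 1);
  // Lane 2 is not demanded, so its non-inline read is ignored.
  assert(findIdentityOperand({0, 1, 7, 3}, {true, true, false, true}) == 0);
  return 0;
}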
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
-static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue
+XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -34559,13 +35071,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT OriginalVT = InVec.getValueType();
+ unsigned NumOriginalElts = OriginalVT.getVectorNumElements();
// Peek through bitcasts, don't duplicate a load with other uses.
InVec = peekThroughOneUseBitcasts(InVec);
EVT CurrentVT = InVec.getValueType();
- if (!CurrentVT.isVector() ||
- CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+ if (!CurrentVT.isVector())
+ return SDValue();
+
+ unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
+ if ((NumOriginalElts % NumCurrentElts) != 0)
return SDValue();
if (!isTargetShuffle(InVec.getOpcode()))
@@ -34582,10 +35098,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
ShuffleOps, ShuffleMask, UnaryShuffle))
return SDValue();
+ unsigned Scale = NumOriginalElts / NumCurrentElts;
+ if (Scale > 1) {
+ SmallVector<int, 16> ScaledMask;
+ scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask);
+ ShuffleMask = std::move(ScaledMask);
+ }
+ assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");
+
// Select the input vector, guarding against out of range extract vector.
- unsigned NumElems = CurrentVT.getVectorNumElements();
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
+ int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];
if (Idx == SM_SentinelZero)
return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
@@ -34598,8 +35121,9 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
return SDValue();
- assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
- SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1];
+ assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
+ "Shuffle index out of range");
+ SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];
// If inputs to shuffle are the same for both ops, then allow 2 uses
unsigned AllowedUses =
@@ -34619,7 +35143,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
- if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+ if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
return SDValue();
// If there's a bitcast before the shuffle, check if the load type and
@@ -34637,10 +35161,11 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
// Create shuffle node taking into account the case that it's a unary shuffle
- SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
- Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
- ShuffleMask);
- Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
+ SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
+ : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
+ Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
+ DAG.getBitcast(OriginalVT, ShuffleOps[0]),
+ Shuffle, ShuffleMask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
@@ -34660,6 +35185,23 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
return false;
}
+// Helper to push sign extension of vXi1 SETCC result through bitops.
+static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
+ SDValue Src, const SDLoc &DL) {
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ return DAG.getNode(
+ Src.getOpcode(), DL, SExtVT,
+ signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
+ signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
+ }
+ llvm_unreachable("Unexpected node type for vXi1 sign extension");
+}
+
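The helper above relies on sign-extension distributing over bitwise ops for boolean inputs; a tiny standalone check of that identity (plain C++, an i1 modelled as bool):

#include <cstdint>
#include <cassert>

static int32_t sext1(bool B) { return B ? -1 : 0; } // sign-extend an i1

int main() {
  for (int A = 0; A < 2; ++A)
    for (int B = 0; B < 2; ++B) {
      assert(sext1(A & B) == (sext1(A) & sext1(B)));
      assert(sext1(A | B) == (sext1(A) | sext1(B)));
      assert(sext1(A ^ B) == (sext1(A) ^ sext1(B)));
    }
  return 0;
}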
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
@@ -34698,6 +35240,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
+ bool PropagateSExt = false;
switch (SrcVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
@@ -34708,8 +35251,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
- if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256))
+ if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
SExtVT = MVT::v4i64;
+ PropagateSExt = true;
+ }
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
@@ -34718,11 +35263,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
- // TODO : use checkBitcastSrcVectorSize
- if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- (Src.getOperand(0).getValueType().is256BitVector() ||
- Src.getOperand(0).getValueType().is512BitVector())) {
+ if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
+ checkBitcastSrcVectorSize(Src, 512))) {
SExtVT = MVT::v8i32;
+ PropagateSExt = true;
}
break;
case MVT::v16i1:
@@ -34745,19 +35289,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
return SDValue();
};
- SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+ SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
+ : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
- if (SExtVT == MVT::v64i8) {
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
- Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
- Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
- Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
- Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
- Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
- DAG.getConstant(32, DL, MVT::i8));
- V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
- } else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
+ if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
V = getPMOVMSKB(DL, V, DAG, Subtarget);
} else {
if (SExtVT == MVT::v8i16)
@@ -34891,8 +35426,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
- DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
- DAG.getConstant(ShufMask, DL, MVT::i8));
+ DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32),
+ Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
}
Ops.append(NumElts, Splat);
} else {
@@ -34935,6 +35470,24 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
+ // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
+ // legalization destroys the v4i32 type.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
+ VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
+ N0.getOperand(0).getValueType() == MVT::v4i32 &&
+ ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
+ SDValue N00 = N0.getOperand(0);
+ // Only do this if we can avoid scalarizing the input.
+ if (ISD::isNormalLoad(N00.getNode()) ||
+ (N00.getOpcode() == ISD::BITCAST &&
+ N00.getOperand(0).getValueType() == MVT::v4f32)) {
+ SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4f32, N00));
+ return DAG.getZExtOrTrunc(V, dl, VT);
+ }
+ }
+
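A small standalone illustration of why this is profitable (assumes an SSE-capable x86 host; uses the <xmmintrin.h> intrinsic, not the DAG APIs): the sign bits of four 32-bit lanes are exactly what MOVMSKPS returns, so no per-lane extraction is needed.

#include <xmmintrin.h>
#include <cassert>

// Returns a 4-bit mask whose bit i is set iff the sign bit of lane i is set,
// i.e. the scalar form of the setcc-less-than-zero pattern matched above.
static int signMask(__m128 V) {
  return _mm_movemask_ps(V); // one instruction instead of four extracts
}

int main() {
  __m128 V = _mm_set_ps(-1.0f, 2.0f, -3.0f, 4.0f); // lanes 0..3: 4, -3, 2, -1
  assert(signMask(V) == 0b1010);                   // lanes 1 and 3 negative
  return 0;
}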
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
@@ -34949,6 +35502,26 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
+ // Use zeros for the widening if we already have some zeroes. This can
+ // allow SimplifyDemandedBits to remove scalar ANDs that may be
+ // downstream of this.
+ // FIXME: It might make sense to detect a concat_vectors with a mix of
+ // zeroes and undef and turn it into insert_subvector for i1 vectors as
+ // a separate combine. What we can't do is canonicalize the operands of
+ // such a concat or we'll get into a loop with SimplifyDemandedBits.
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
+ SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
+ if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
+ SrcVT = LastOp.getValueType();
+ unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
+ SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
+ Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
+ N0 = DAG.getBitcast(MVT::i8, N0);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
+ }
+ }
+
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
@@ -34958,6 +35531,33 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
}
}
+ // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
+ // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
+ // due to insert_subvector legalization on KNL. By promoting the copy to i16
+ // we can help with known bits propagation from the vXi1 domain to the
+ // scalar domain.
+ if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
+ !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getOperand(0).getValueType() == MVT::v16i1 &&
+ isNullConstant(N0.getOperand(1)))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
+ DAG.getBitcast(MVT::i16, N0.getOperand(0)));
+
+ // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT
+ // determines the number of bits loaded. Remaining bits are zero.
+ if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
+ VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
+ auto *BCast = cast<MemIntrinsicSDNode>(N0);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ VT.getVectorElementType(),
+ BCast->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
+ return ResNode;
+ }
+
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
@@ -35152,7 +35752,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Src = DAG.matchBinOpReduction(
- Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
+ Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
if (!Src)
return SDValue();
@@ -35246,29 +35846,31 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SDLoc DL(Extract);
EVT MatchVT = Match.getValueType();
unsigned NumElts = MatchVT.getVectorNumElements();
+ unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ExtractVT == MVT::i1) {
// Special case for (pre-legalization) vXi1 reductions.
- if (NumElts > 32)
+ if (NumElts > 64 || !isPowerOf2_32(NumElts))
return SDValue();
- if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) {
+ if (TLI.isTypeLegal(MatchVT)) {
// If this is a legal AVX512 predicate type then we can just bitcast.
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
// Use combineBitcastvxi1 to create the MOVMSK.
- if (NumElts == 32 && !Subtarget.hasInt256()) {
+ while (NumElts > MaxElts) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
- NumElts = 16;
+ NumElts /= 2;
}
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
}
if (!Movmsk)
return SDValue();
- Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32);
+ Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
} else {
// Bail with AVX512VL (which uses predicate registers).
if (Subtarget.hasVLX())
@@ -35309,13 +35911,15 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
NumElts = MaskSrcVT.getVectorNumElements();
}
- assert(NumElts <= 32 && "Not expecting more than 32 elements");
+ assert((NumElts <= 32 || NumElts == 64) &&
+ "Not expecting more than 64 elements");
+ MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
if (BinOp == ISD::XOR) {
// parity -> (AND (CTPOP(MOVMSK X)), 1)
- SDValue Mask = DAG.getConstant(1, DL, MVT::i32);
- SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk);
- Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask);
+ SDValue Mask = DAG.getConstant(1, DL, CmpVT);
+ SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
+ Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
}
@@ -35323,19 +35927,19 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
- CmpC = DAG.getConstant(0, DL, MVT::i32);
+ CmpC = DAG.getConstant(0, DL, CmpVT);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
- CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32);
+ CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
+ DL, CmpVT);
CondCode = ISD::CondCode::SETEQ;
}
// The setcc produces an i8 of 0/1, so extend that to the result width and
// negate to get the final 0/-1 mask value.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetccVT =
- TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
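A hedged standalone illustration (SSE2 x86 host assumed; <emmintrin.h> intrinsics plus the GCC/Clang __builtin_popcount) of the reductions handled above: once the vXi1 mask is materialized as a MOVMSK integer, any_of, all_of and parity are just a compare or a popcount on that scalar.

#include <emmintrin.h>
#include <cassert>

struct BoolReductions {
  bool AnyOf, AllOf, Parity;
};

// Reduce a v16i8 comparison result (0x00 / 0xFF per lane) with PMOVMSKB.
static BoolReductions reduceMask(__m128i CmpResult) {
  unsigned Msk = (unsigned)_mm_movemask_epi8(CmpResult); // 16 mask bits
  BoolReductions R;
  R.AnyOf = Msk != 0;                        // any_of -> MOVMSK != 0
  R.AllOf = Msk == 0xFFFF;                   // all_of -> MOVMSK == (1<<16)-1
  R.Parity = (__builtin_popcount(Msk) & 1);  // parity -> CTPOP(MOVMSK) & 1
  return R;
}

int main() {
  __m128i A = _mm_set1_epi8(1), B = _mm_set1_epi8(1);
  BoolReductions R = reduceMask(_mm_cmpeq_epi8(A, B)); // all lanes true
  assert(R.AnyOf && R.AllOf && !R.Parity);             // 16 set bits -> even parity
  return 0;
}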
@@ -35431,6 +36035,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SDLoc dl(N);
SDValue Src = N->getOperand(0);
SDValue Idx = N->getOperand(1);
@@ -35452,10 +36057,37 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(VT, SrcOp);
}
+ // If we're extracting a single element from a broadcast load and there are
+ // no other users, just create a single load.
+ if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
+ unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
+ if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
+ VT.getSizeInBits() == SrcBCWidth) {
+ SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
+ MemIntr->getBasePtr(),
+ MemIntr->getPointerInfo(),
+ MemIntr->getAlignment(),
+ MemIntr->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
+ return Load;
+ }
+ }
+
+ // Handle extract(truncate(x)) for 0'th index.
+ // TODO: Treat this as a faux shuffle?
+ // TODO: When can we use this for general indices?
+ if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() &&
+ isNullConstant(Idx)) {
+ Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
+ Src = DAG.getBitcast(SrcVT, Src);
+ return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
+ }
+
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
- if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
+ if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
@@ -35489,7 +36121,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue();
int SrcIdx = Mask[N->getConstantOperandVal(1)];
- SDLoc dl(N);
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
@@ -35584,7 +36215,7 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
}
// TODO: This switch could include FNEG and the x86-specific FP logic ops
- // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
+ // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
// missed load folding and fma+fneg combining.
switch (Vec.getOpcode()) {
case ISD::FMA: // Begin 3 operands
@@ -35631,27 +36262,84 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
- if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
- return SDValue();
- SDValue Index = ExtElt->getOperand(1);
- if (!isNullConstant(Index))
- return SDValue();
- // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
ISD::NodeType Opc;
- SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
+ SDValue Rdx =
+ DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
if (!Rdx)
return SDValue();
+ SDValue Index = ExtElt->getOperand(1);
+ assert(isNullConstant(Index) &&
+ "Reduction doesn't end in an extract from index 0");
+
EVT VT = ExtElt->getValueType(0);
- EVT VecVT = ExtElt->getOperand(0).getValueType();
+ EVT VecVT = Rdx.getValueType();
if (VecVT.getScalarType() != VT)
return SDValue();
- unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
SDLoc DL(ExtElt);
+ // vXi8 reduction - sub 128-bit vector.
+ if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
+ if (VecVT == MVT::v4i8) {
+ // Pad with zero.
+ if (Subtarget.hasSSE41()) {
+ Rdx = DAG.getBitcast(MVT::i32, Rdx);
+ Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+ DAG.getConstant(0, DL, MVT::v4i32), Rdx,
+ DAG.getIntPtrConstant(0, DL));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ } else {
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
+ DAG.getConstant(0, DL, VecVT));
+ }
+ }
+ if (Rdx.getValueType() == MVT::v8i8) {
+ // Pad with undef.
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
+ DAG.getUNDEF(MVT::v8i8));
+ }
+ Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
+ DAG.getConstant(0, DL, MVT::v16i8));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
+ // Must be a >=128-bit vector with pow2 elements.
+ if ((VecVT.getSizeInBits() % 128) != 0 ||
+ !isPowerOf2_32(VecVT.getVectorNumElements()))
+ return SDValue();
+
+ // vXi8 reduction - sum lo/hi halves then use PSADBW.
+ if (VT == MVT::i8) {
+ while (Rdx.getValueSizeInBits() > 128) {
+ unsigned HalfSize = VecVT.getSizeInBits() / 2;
+ unsigned HalfElts = VecVT.getVectorNumElements() / 2;
+ SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
+ SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
+ Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
+ VecVT = Rdx.getValueType();
+ }
+ assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
+
+ SDValue Hi = DAG.getVectorShuffle(
+ MVT::v16i8, DL, Rdx, Rdx,
+ {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+ Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
+ Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
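A minimal standalone sketch (x86-64 host with SSE2 assumed) of the PSADBW trick used for the vXi8 reductions above: summing absolute differences against zero adds up the bytes of each 8-byte half, so a single PSADBW plus one scalar add reduces 16 bytes.

#include <emmintrin.h>
#include <cstdint>
#include <cassert>

static uint32_t sumBytes(__m128i V) {
  __m128i Sad = _mm_sad_epu8(V, _mm_setzero_si128()); // two 64-bit partial sums
  // Add the low and high 64-bit lanes (the DAG code above instead folds the
  // halves with a shuffle + ADD before the final PSADBW).
  return (uint32_t)(_mm_cvtsi128_si64(Sad) +
                    _mm_cvtsi128_si64(_mm_srli_si128(Sad, 8)));
}

int main() {
  __m128i V = _mm_set1_epi8(3); // sixteen bytes of 3
  assert(sumBytes(V) == 48);
  return 0;
}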
+ // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing
+ // code size.
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ return SDValue();
+
+ unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+
// 256-bit horizontal instructions operate on 128-bit chunks rather than
// across the whole vector, so we need an extract + hop preliminary stage.
// This is the only step where the operands of the hop are not the same value.
@@ -35661,15 +36349,14 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
unsigned NumElts = VecVT.getVectorNumElements();
SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
- VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
- Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
+ Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
+ VecVT = Rdx.getValueType();
}
if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
!((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
return SDValue();
// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
- assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
for (unsigned i = 0; i != ReductionSteps; ++i)
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
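A short standalone sketch (SSSE3 host assumed) of the horizontal-add reduction loop above: applying PHADDD log2(NumElts) times leaves the total in lane 0.

#include <tmmintrin.h>
#include <emmintrin.h>
#include <cassert>

static int reduceAdd4(__m128i V) {
  V = _mm_hadd_epi32(V, V);    // {a+b, c+d, a+b, c+d}
  V = _mm_hadd_epi32(V, V);    // {a+b+c+d, ...}
  return _mm_cvtsi128_si32(V); // extract lane 0
}

int main() {
  assert(reduceAdd4(_mm_set_epi32(4, 3, 2, 1)) == 10);
  return 0;
}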
@@ -35714,15 +36401,26 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
}
}
- // TODO - Remove this once we can handle the implicit zero-extension of
- // X86ISD::PEXTRW/X86ISD::PEXTRB in:
- // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
- // combineBasicSADPattern.
if (IsPextr) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(
SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
return SDValue(N, 0);
+
+ // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
+ if ((InputVector.getOpcode() == X86ISD::PINSRB ||
+ InputVector.getOpcode() == X86ISD::PINSRW) &&
+ InputVector.getOperand(2) == EltIdx) {
+ assert(SrcVT == InputVector.getOperand(0).getValueType() &&
+ "Vector type mismatch");
+ SDValue Scl = InputVector.getOperand(1);
+ Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
+ return DAG.getZExtOrTrunc(Scl, dl, VT);
+ }
+
+ // TODO - Remove this once we can handle the implicit zero-extension of
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,
+ // combineHorizontalPredicateResult and combineBasicSADPattern.
return SDValue();
}
@@ -35832,6 +36530,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
// get simplified at node creation time)?
bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ // If both inputs are 0/undef, create a complete zero vector.
+ // FIXME: As noted above this should be handled by DAGCombiner/getNode.
+ if (TValIsAllZeros && FValIsAllZeros) {
+ if (VT.isFloatingPoint())
+ return DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getConstant(0, DL, VT);
+ }
+
if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
@@ -36295,8 +37002,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Since SKX these selects have a proper lowering.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
- (ExperimentalVectorWideningLegalization ||
- VT.getVectorNumElements() > 4) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
@@ -36358,6 +37063,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// subl %esi, $edi
// cmovsl %eax, %edi
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+ Cond.hasOneUse() &&
DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -36508,6 +37214,12 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
return V;
+ // select(~Cond, X, Y) -> select(Cond, Y, X)
+ if (CondVT.getScalarType() != MVT::i1)
+ if (SDValue CondNot = IsNOT(Cond, DAG))
+ return DAG.getNode(N->getOpcode(), DL, VT,
+ DAG.getBitcast(CondVT, CondNot), RHS, LHS);
+
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);
@@ -36873,8 +37585,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
- SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
- Flags};
+ SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
+ Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
@@ -36923,12 +37635,13 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
- uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
- if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+ APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
+ assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
+ "Implicit constant truncation");
bool isFastMultiplier = false;
- if (Diff < 10) {
- switch ((unsigned char)Diff) {
+ if (Diff.ult(10)) {
+ switch (Diff.getZExtValue()) {
default: break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
@@ -36943,7 +37656,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
}
if (isFastMultiplier) {
- APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
Cond = getSETCC(CC, Cond, DL ,DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
@@ -36994,8 +37706,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
if (CC == X86::COND_E &&
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
- SDValue Ops[] = { FalseOp, Cond.getOperand(0),
- DAG.getConstant(CC, DL, MVT::i8), Cond };
+ SDValue Ops[] = {FalseOp, Cond.getOperand(0),
+ DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
@@ -37029,10 +37741,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
CC1 = X86::GetOppositeBranchCondition(CC1);
}
- SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
- Flags};
+ SDValue LOps[] = {FalseOp, TrueOp,
+ DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
- SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
+ SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
+ Flags};
SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
return CMOV;
}
@@ -37064,9 +37777,9 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
// This should constant fold.
SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
- SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
- DAG.getConstant(X86::COND_NE, DL, MVT::i8),
- Cond);
+ SDValue CMov =
+ DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
+ DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
}
}
@@ -37166,98 +37879,45 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
if ((NumElts % 2) != 0)
return SDValue();
- unsigned RegSize = 128;
- MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
- if (ExperimentalVectorWideningLegalization ||
- NumElts >= OpsVT.getVectorNumElements()) {
- // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
- // lower part is needed.
- SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
- if (Mode == MULU8 || Mode == MULS8)
- return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
- DL, VT, MulLo);
-
- MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
- // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
- // the higher part is also needed.
- SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
- ReducedVT, NewN0, NewN1);
-
- // Repack the lower part and higher part result of mul into a wider
- // result.
- // Generate shuffle functioning as punpcklwd.
- SmallVector<int, 16> ShuffleMask(NumElts);
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
- ShuffleMask[2 * i] = i;
- ShuffleMask[2 * i + 1] = i + NumElts;
- }
- SDValue ResLo =
- DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
- ResLo = DAG.getBitcast(ResVT, ResLo);
- // Generate shuffle functioning as punpckhwd.
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
- ShuffleMask[2 * i] = i + NumElts / 2;
- ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
- }
- SDValue ResHi =
- DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
- ResHi = DAG.getBitcast(ResVT, ResHi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
- }
-
- // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
- // to legalize the mul explicitly because implicit legalization for type
- // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
- // instructions which will not exist when we explicitly legalize it by
- // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
- // <4 x i16> undef).
- //
- // Legalize the operands of mul.
- // FIXME: We may be able to handle non-concatenated vectors by insertion.
- unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
- if ((RegSize % ReducedSizeInBits) != 0)
- return SDValue();
-
- SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
- DAG.getUNDEF(ReducedVT));
- Ops[0] = NewN0;
- NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
- Ops[0] = NewN1;
- NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
-
- if (Mode == MULU8 || Mode == MULS8) {
- // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
- // part is needed.
- SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
-
- // convert the type of mul result to VT.
- MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
- SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
- : ISD::SIGN_EXTEND_VECTOR_INREG,
- DL, ResVT, Mul);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
- DAG.getIntPtrConstant(0, DL));
- }
+ // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+ // lower part is needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+ if (Mode == MULU8 || Mode == MULS8)
+ return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ DL, VT, MulLo);
- // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
- // MULU16/MULS16, both parts are needed.
- SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+ MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
+ // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+ // the higher part is also needed.
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
- OpsVT, NewN0, NewN1);
+ ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
- // result. Make sure the type of mul result is VT.
- MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
- SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
- Res = DAG.getBitcast(ResVT, Res);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
- DAG.getIntPtrConstant(0, DL));
+ // result.
+ // Generate shuffle functioning as punpcklwd.
+ SmallVector<int, 16> ShuffleMask(NumElts);
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
+ ShuffleMask[2 * i] = i;
+ ShuffleMask[2 * i + 1] = i + NumElts;
+ }
+ SDValue ResLo =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResLo = DAG.getBitcast(ResVT, ResLo);
+ // Generate shuffle functioning as punpckhwd.
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
+ ShuffleMask[2 * i] = i + NumElts / 2;
+ ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
+ }
+ SDValue ResHi =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResHi = DAG.getBitcast(ResVT, ResHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
}
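A standalone sketch (SSE2 host assumed) of the repack the rewritten reduceVMULWidth emits: PMULLW and PMULHW produce the low and high 16 bits of each product, and the punpcklwd/punpckhwd-style unpacks interleave them back into full 32-bit results.

#include <emmintrin.h>
#include <cstdint>
#include <cassert>

// Multiply eight signed 16-bit lanes into eight 32-bit products (Lo and Hi).
static void mulWiden(__m128i A, __m128i B, __m128i &Lo, __m128i &Hi) {
  __m128i MulLo = _mm_mullo_epi16(A, B); // low 16 bits of each product
  __m128i MulHi = _mm_mulhi_epi16(A, B); // high 16 bits of each product
  Lo = _mm_unpacklo_epi16(MulLo, MulHi); // products 0..3 as i32
  Hi = _mm_unpackhi_epi16(MulLo, MulHi); // products 4..7 as i32
}

int main() {
  __m128i Lo, Hi;
  mulWiden(_mm_set1_epi16(300), _mm_set1_epi16(400), Lo, Hi);
  alignas(16) int32_t Out[4];
  _mm_store_si128((__m128i *)Out, Lo);
  assert(Out[0] == 120000); // 300 * 400 does not fit in 16 bits
  return 0;
}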
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
@@ -37365,8 +38025,7 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
// Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
// Also allow v2i32 if it will be widened.
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
- if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) ||
- DAG.getTargetLoweringInfo().isTypeLegal(WVT)))
+ if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -37919,7 +38578,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
if (NewShiftVal >= NumBitsPerElt)
NewShiftVal = NumBitsPerElt - 1;
return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
- DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8));
+ DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
}
// We can decode 'whole byte' logical bit shifts as shuffles.
@@ -38039,7 +38698,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
- DAG.getConstant(x86cc, DL, MVT::i8));
+ DAG.getTargetConstant(x86cc, DL, MVT::i8));
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
@@ -38048,10 +38707,9 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
N->getSimpleValueType(0));
}
- SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
- CMP00.getValueType(), CMP00, CMP01,
- DAG.getConstant(x86cc, DL,
- MVT::i8));
+ SDValue OnesOrZeroesF =
+ DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
+ CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
@@ -38083,34 +38741,6 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Match (xor X, -1) -> X.
-// Match extract_subvector(xor X, -1) -> extract_subvector(X).
-// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
- V = peekThroughBitcasts(V);
- if (V.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
- return V.getOperand(0);
- if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
- if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
- Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
- Not, V.getOperand(1));
- }
- }
- SmallVector<SDValue, 2> CatOps;
- if (collectConcatOps(V.getNode(), CatOps)) {
- for (SDValue &CatOp : CatOps) {
- SDValue NotCat = IsNOT(CatOp, DAG);
- if (!NotCat) return SDValue();
- CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
- }
- return SDValue();
-}
-
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
@@ -38273,7 +38903,7 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
unsigned ShiftVal = SplatVal.countTrailingOnes();
- SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
+ SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
@@ -38499,7 +39129,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
- if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
SrcOps.size() == 1) {
SDLoc dl(N);
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
@@ -38570,7 +39200,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
}
if (SDValue Shuffle = combineX86ShufflesRecursively(
- {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
+ {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
@@ -38585,7 +39215,7 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
- EVT VT = N->getValueType(0);
+ MVT VT = N->getSimpleValueType(0);
if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
@@ -38594,10 +39224,12 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
return SDValue();
- // On XOP we'll lower to PCMOV so accept one use, otherwise only
- // do this if either mask has multiple uses already.
- if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() ||
- !N1.getOperand(1).hasOneUse()))
+ // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
+ // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
+ bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
+ Subtarget.hasVLX();
+ if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
+ !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
return SDValue();
// Attempt to extract constant byte masks.
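For reference, a scalar sketch of the bit-select these byte masks describe, i.e. the operation that VPCMOV or VPTERNLOG performs in a single instruction on the targets mentioned above:

#include <cstdint>
#include <cassert>

// Select, bit by bit, X where M is 1 and Y where M is 0.
static uint64_t bitSelect(uint64_t M, uint64_t X, uint64_t Y) {
  return (X & M) | (Y & ~M);
}

int main() {
  assert(bitSelect(0x00FF00FF00FF00FFULL, ~0ULL, 0ULL) ==
         0x00FF00FF00FF00FFULL);
  return 0;
}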
@@ -38895,6 +39527,24 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
DAG.getBitcast(MVT::v4f32, N1)));
}
+ // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (Mask) {
+ APInt AllBits = APInt::getNullValue(NumElts);
+ return DAG.getSetCC(dl, MVT::i1, Mask,
+ DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
+ }
+ }
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -39136,26 +39786,6 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
-/// Check if truncation with saturation form type \p SrcVT to \p DstVT
-/// is valid for the given \p Subtarget.
-static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasAVX512())
- return false;
-
- // FIXME: Scalar type may be supported if we move it to vector register.
- if (!SrcVT.isVector())
- return false;
-
- EVT SrcElVT = SrcVT.getScalarType();
- EVT DstElVT = DstVT.getScalarType();
- if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
- return false;
- if (SrcVT.is512BitVector() || Subtarget.hasVLX())
- return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
- return false;
-}
-
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
@@ -39253,64 +39883,61 @@ static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
return SDValue();
}
-/// Detect a pattern of truncation with signed saturation.
-/// The types should allow to use VPMOVSS* instruction on AVX512.
-/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched.
-static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
- if (!TLI.isTypeLegal(In.getValueType()))
- return SDValue();
- if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
- return SDValue();
- return detectSSatPattern(In, VT);
-}
-
-/// Detect a pattern of truncation with saturation:
-/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
-/// The types should allow to use VPMOVUS* instruction on AVX512.
-/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched.
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
- const SDLoc &DL,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
- if (!TLI.isTypeLegal(In.getValueType()))
- return SDValue();
- if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
- return SDValue();
- return detectUSatPattern(In, VT, DAG, DL);
-}
-
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- EVT SVT = VT.getScalarType();
+ if (!Subtarget.hasSSE2() || !VT.isVector())
+ return SDValue();
+
+ EVT SVT = VT.getVectorElementType();
EVT InVT = In.getValueType();
- EVT InSVT = InVT.getScalarType();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
- isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
- if (auto SSatVal = detectSSatPattern(In, VT))
- return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
- if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
- return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
- }
- if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
- !Subtarget.hasAVX512() &&
+ EVT InSVT = InVT.getVectorElementType();
+
+ // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
+ // split across two registers, we can use a packusdw+perm to clamp to 0-65535
+ // and concatenate at the same time. Then we can use a final vpmovuswb to
+ // clip to 0-255.
+ if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ InVT == MVT::v16i32 && VT == MVT::v16i8) {
+ if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
+ SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
+ DL, DAG, Subtarget);
+ assert(Mid && "Failed to pack!");
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
+ }
+ }
+
+ // vXi32 truncate instructions are available with AVX512F.
+ // vXi16 truncate instructions are only available with AVX512BW.
+ // For 256-bit or smaller vectors, we require VLX.
+ // FIXME: We could widen truncates to 512 to remove the VLX restriction.
+ // If the result type is 256 bits or larger and we have disabled 512-bit
+ // registers, we should go ahead and use the pack instructions if possible.
+ bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
+ (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
+ (InVT.getSizeInBits() > 128) &&
+ (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
+ !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
+
+ if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
+ VT.getSizeInBits() >= 64 &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
+ // Only do this when the result is at least 64 bits or we'd be leaving
+ // dangling PACKSSDW nodes.
if (SVT == MVT::i8 && InSVT == MVT::i32) {
EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements());
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
DAG, Subtarget);
- if (Mid)
- return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
- Subtarget);
+ assert(Mid && "Failed to pack!");
+ SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
+ Subtarget);
+ assert(V && "Failed to pack!");
+ return V;
} else if (SVT == MVT::i8 || Subtarget.hasSSE41())
return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
Subtarget);
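A standalone sketch (SSE2 host assumed) of the PACKUSWB(PACKSSDW, PACKSSDW) lowering mentioned in the comments above: PACKSSDW narrows i32 to i16 with signed saturation, and PACKUSWB narrows i16 to u8 with unsigned saturation, giving the 0..255 clamp in two pack stages.

#include <emmintrin.h>
#include <cstdint>
#include <cassert>

static __m128i truncClamp_i32_to_u8(__m128i A, __m128i B, __m128i C,
                                    __m128i D) {
  __m128i Lo = _mm_packs_epi32(A, B); // 8 x i16, signed-saturated
  __m128i Hi = _mm_packs_epi32(C, D);
  return _mm_packus_epi16(Lo, Hi);    // 16 x u8, unsigned-saturated
}

int main() {
  __m128i A = _mm_set_epi32(70000, 255, -7, 3); // lanes 0..3: 3, -7, 255, 70000
  __m128i R = truncClamp_i32_to_u8(A, A, A, A);
  alignas(16) uint8_t Out[16];
  _mm_store_si128((__m128i *)Out, R);
  assert(Out[0] == 3 && Out[1] == 0 && Out[2] == 255 && Out[3] == 255);
  return 0;
}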
@@ -39319,6 +39946,42 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
Subtarget);
}
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
+ Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
+ unsigned TruncOpc;
+ SDValue SatVal;
+ if (auto SSatVal = detectSSatPattern(In, VT)) {
+ SatVal = SSatVal;
+ TruncOpc = X86ISD::VTRUNCS;
+ } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
+ SatVal = USatVal;
+ TruncOpc = X86ISD::VTRUNCUS;
+ }
+ if (SatVal) {
+ unsigned ResElts = VT.getVectorNumElements();
+ // If the input type is less than 512 bits and we don't have VLX, we need
+ // to widen to 512 bits.
+ if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
+ unsigned NumConcats = 512 / InVT.getSizeInBits();
+ ResElts *= NumConcats;
+ SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
+ ConcatOps[0] = SatVal;
+ InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
+ NumConcats * InVT.getVectorNumElements());
+ SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
+ }
+ // Widen the result if its narrower than 128 bits.
+ if (ResElts * SVT.getSizeInBits() < 128)
+ ResElts = 128 / SVT.getSizeInBits();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
+ SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+
return SDValue();
}
@@ -39377,7 +40040,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return true;
};
- // Check if each element of the vector is left-shifted by one.
+ // Check if each element of the vector is right-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
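A small standalone sketch (SSE2 host assumed) of the rounding-average pattern this function detects, which maps directly to PAVGB:

#include <emmintrin.h>
#include <cstdint>
#include <cassert>

static uint8_t scalarAvg(uint8_t A, uint8_t B) {
  return (uint8_t)(((unsigned)A + (unsigned)B + 1) >> 1); // rounding average
}

int main() {
  __m128i R = _mm_avg_epu8(_mm_set1_epi8((char)250), _mm_set1_epi8((char)7));
  alignas(16) uint8_t Out[16];
  _mm_store_si128((__m128i *)Out, R);
  assert(Out[0] == scalarAvg(250, 7)); // (250 + 7 + 1) / 2 = 129
  return 0;
}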
@@ -39679,90 +40342,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
return Blend;
}
- if (Mld->getExtensionType() != ISD::EXTLOAD)
- return SDValue();
-
- // Resolve extending loads.
- EVT VT = Mld->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
- EVT LdVT = Mld->getMemoryVT();
- SDLoc dl(Mld);
-
- assert(LdVT != VT && "Cannot extend to the same type");
- unsigned ToSz = VT.getScalarSizeInBits();
- unsigned FromSz = LdVT.getScalarSizeInBits();
- // From/To sizes and ElemCount must be pow of two.
- assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
- "Unexpected size for extending masked load");
-
- unsigned SizeRatio = ToSz / FromSz;
- assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
-
- // Create a type on which we perform the shuffle.
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
- LdVT.getScalarType(), NumElems*SizeRatio);
- assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
-
- // Convert PassThru value.
- SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru());
- if (!Mld->getPassThru().isUndef()) {
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
-
- // Can't shuffle using an illegal type.
- assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
- "WideVecVT should be legal");
- WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru,
- DAG.getUNDEF(WideVecVT), ShuffleVec);
- }
-
- // Prepare the new mask.
- SDValue NewMask;
- SDValue Mask = Mld->getMask();
- if (Mask.getValueType() == VT) {
- // Mask and original value have the same type.
- NewMask = DAG.getBitcast(WideVecVT, Mask);
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
- for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
- ShuffleVec[i] = NumElems * SizeRatio;
- NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
- DAG.getConstant(0, dl, WideVecVT),
- ShuffleVec);
- } else {
- assert(Mask.getValueType().getVectorElementType() == MVT::i1);
- unsigned WidenNumElts = NumElems*SizeRatio;
- unsigned MaskNumElts = VT.getVectorNumElements();
- EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- WidenNumElts);
-
- unsigned NumConcat = WidenNumElts / MaskNumElts;
- SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
- SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
- Ops[0] = Mask;
- NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
- }
-
- SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
- Mld->getBasePtr(), NewMask, WidePassThru,
- Mld->getMemoryVT(), Mld->getMemOperand(),
- ISD::NON_EXTLOAD);
-
- SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i * SizeRatio] = i;
-
- // Can't shuffle using an illegal type.
- assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
- "WideVecVT should be legal");
- SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
- DAG.getUNDEF(WideVecVT), ShuffleVec);
- SlicedVec = DAG.getBitcast(VT, SlicedVec);
-
- return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
+ return SDValue();
}
/// If exactly one element of the mask is set for a non-truncating masked store,
@@ -39800,123 +40380,45 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = Mst->getValue().getValueType();
- EVT StVT = Mst->getMemoryVT();
SDLoc dl(Mst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!Mst->isTruncatingStore()) {
- if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
- return ScalarStore;
-
- // If the mask value has been legalized to a non-boolean vector, try to
- // simplify ops leading up to it. We only demand the MSB of each lane.
- SDValue Mask = Mst->getMask();
- if (Mask.getScalarValueSizeInBits() != 1) {
- APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
- return SDValue(N, 0);
- }
-
- // TODO: AVX512 targets should also be able to simplify something like the
- // pattern above, but that pattern will be different. It will either need to
- // match setcc more generally or match PCMPGTM later (in tablegen?).
-
- SDValue Value = Mst->getValue();
- if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
- TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
- Mst->getMemoryVT())) {
- return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
- Mst->getBasePtr(), Mask,
- Mst->getMemoryVT(), Mst->getMemOperand(), true);
- }
-
+ if (Mst->isTruncatingStore())
return SDValue();
- }
-
- // Resolve truncating stores.
- unsigned NumElems = VT.getVectorNumElements();
- assert(StVT != VT && "Cannot truncate to the same type");
- unsigned FromSz = VT.getScalarSizeInBits();
- unsigned ToSz = StVT.getScalarSizeInBits();
-
- // The truncating store is legal in some cases. For example
- // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
- // are designated for truncate store.
- // In this case we don't need any further transformations.
- if (TLI.isTruncStoreLegal(VT, StVT))
- return SDValue();
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+ return ScalarStore;
- // From/To sizes and ElemCount must be pow of two.
- assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
- "Unexpected size for truncating masked store");
- // We are going to use the original vector elt for storing.
- // Accumulated smaller vector elements must be a multiple of the store size.
- assert (((NumElems * FromSz) % ToSz) == 0 &&
- "Unexpected ratio for truncating masked store");
-
- unsigned SizeRatio = FromSz / ToSz;
- assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
-
- // Create a type on which we perform the shuffle.
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
- StVT.getScalarType(), NumElems*SizeRatio);
-
- assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
-
- SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
-
- // Can't shuffle using an illegal type.
- assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
- "WideVecVT should be legal");
-
- SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
- DAG.getUNDEF(WideVecVT),
- ShuffleVec);
-
- SDValue NewMask;
+ // If the mask value has been legalized to a non-boolean vector, try to
+ // simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
- if (Mask.getValueType() == VT) {
- // Mask and original value have the same type.
- NewMask = DAG.getBitcast(WideVecVT, Mask);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
- for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
- ShuffleVec[i] = NumElems*SizeRatio;
- NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
- DAG.getConstant(0, dl, WideVecVT),
- ShuffleVec);
- } else {
- assert(Mask.getValueType().getVectorElementType() == MVT::i1);
- unsigned WidenNumElts = NumElems*SizeRatio;
- unsigned MaskNumElts = VT.getVectorNumElements();
- EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- WidenNumElts);
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ return SDValue(N, 0);
+ }
- unsigned NumConcat = WidenNumElts / MaskNumElts;
- SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
- SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
- Ops[0] = Mask;
- NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ SDValue Value = Mst->getValue();
+ if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+ Mst->getMemoryVT())) {
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
+ Mst->getBasePtr(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(), true);
}
- return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
- Mst->getBasePtr(), NewMask, StVT,
- Mst->getMemOperand(), false);
+ return SDValue();
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
- EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
unsigned Alignment = St->getAlignment();
- SDValue StoredVal = St->getOperand(1);
+ SDValue StoredVal = St->getValue();
+ EVT VT = StoredVal.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Convert a store of vXi1 into a store of iX and a bitcast.
@@ -39986,8 +40488,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getMemOperand()->getFlags());
}
- // If we are saving a concatenation of two XMM registers and 32-byte stores
- // are slow, such as on Sandy Bridge, perform two 16-byte stores.
+ // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
+ // Sandy Bridge, perform two 16-byte stores.
bool Fast;
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
@@ -40026,13 +40528,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
St->getValue().getOpcode() == ISD::TRUNCATE &&
St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
- TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) &&
- !DCI.isBeforeLegalizeOps()) {
+ TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
+ St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
MVT::v16i8, St->getMemOperand());
}
+ // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
+ if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
+ (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
+ StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
+ TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
+ bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
+ return EmitTruncSStore(IsSigned, St->getChain(),
+ dl, StoredVal.getOperand(0), St->getBasePtr(),
+ VT, St->getMemOperand(), DAG);
+ }
+
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
@@ -40040,100 +40553,26 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// Check if we can detect an AVG pattern from the truncation. If yes,
// replace the trunc store by a normal store with the result of X86ISD::AVG
// instruction.
- if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
- Subtarget, dl))
- return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
-
- if (SDValue Val =
- detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
- TLI))
- return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
- dl, Val, St->getBasePtr(),
- St->getMemoryVT(), St->getMemOperand(), DAG);
- if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
- DAG, dl, Subtarget, TLI))
- return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
- dl, Val, St->getBasePtr(),
- St->getMemoryVT(), St->getMemOperand(), DAG);
-
- unsigned NumElems = VT.getVectorNumElements();
- assert(StVT != VT && "Cannot truncate to the same type");
- unsigned FromSz = VT.getScalarSizeInBits();
- unsigned ToSz = StVT.getScalarSizeInBits();
-
- // The truncating store is legal in some cases. For example
- // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
- // are designated for truncate store.
- // In this case we don't need any further transformations.
- if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
- return SDValue();
-
- // From, To sizes and ElemCount must be pow of two
- if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
- // We are going to use the original vector elt for storing.
- // Accumulated smaller vector elements must be a multiple of the store size.
- if (0 != (NumElems * FromSz) % ToSz) return SDValue();
-
- unsigned SizeRatio = FromSz / ToSz;
-
- assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
-
- // Create a type on which we perform the shuffle
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
- StVT.getScalarType(), NumElems*SizeRatio);
-
- assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
-
- SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
- SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
+ if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
+ if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
+ Subtarget, dl))
+ return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
- // Can't shuffle using an illegal type.
- if (!TLI.isTypeLegal(WideVecVT))
- return SDValue();
-
- SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
- DAG.getUNDEF(WideVecVT),
- ShuffleVec);
- // At this point all of the data is stored at the bottom of the
- // register. We now need to save it to mem.
-
- // Find the largest store unit
- MVT StoreType = MVT::i8;
- for (MVT Tp : MVT::integer_valuetypes()) {
- if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
- StoreType = Tp;
- }
-
- // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
- if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
- (64 <= NumElems * ToSz))
- StoreType = MVT::f64;
-
- // Bitcast the original vector into a vector of store-size units
- EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
- StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
- assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
- SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
- SmallVector<SDValue, 8> Chains;
- SDValue Ptr = St->getBasePtr();
-
- // Perform one or more big stores into memory.
- for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- StoreType, ShuffWide,
- DAG.getIntPtrConstant(i, dl));
- SDValue Ch =
- DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags());
- Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
- Chains.push_back(Ch);
+ if (TLI.isTruncStoreLegal(VT, StVT)) {
+ if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
+ return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+ if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
+ DAG, dl))
+ return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
}
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ return SDValue();
}
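For context on the saturated truncating stores emitted just above, a rough scalar equivalent (illustration only, not from this patch) of the signed i32-to-i8 case that detectSSatPattern matches:

#include <algorithm>
#include <cstdint>

// Signed-saturating truncate: clamp to the i8 range, then narrow. The vector
// form of this (smin/smax feeding a truncating store) is what gets handed to
// EmitTruncSStore above when the truncating store is legal.
static inline int8_t truncSSatI32ToI8(int32_t X) {
  return static_cast<int8_t>(std::min(127, std::max(-128, X)));
}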
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
@@ -40149,11 +40588,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
- if (((VT.isVector() && !VT.isFloatingPoint()) ||
- (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
+ if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
isa<LoadSDNode>(St->getValue()) &&
- !cast<LoadSDNode>(St->getValue())->isVolatile() &&
- St->getChain().hasOneUse() && !St->isVolatile()) {
+ cast<LoadSDNode>(St->getValue())->isSimple() &&
+ St->getChain().hasOneUse() && St->isSimple()) {
LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
SmallVector<SDValue, 8> Ops;
@@ -40595,8 +41033,8 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // Requires SSE2 but AVX512 has fast truncate.
- if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ // Requires SSE2.
+ if (!Subtarget.hasSSE2())
return SDValue();
if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
@@ -40620,6 +41058,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
+ // AVX512 has fast truncate, but if the input is already going to be split,
+ // there's no harm in trying to pack.
+ if (Subtarget.hasAVX512() &&
+ !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
+ InVT.is512BitVector()))
+ return SDValue();
+
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
@@ -40658,9 +41103,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
// Only handle vXi16 types that are at least 128-bits unless they will be
// widened.
- if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
- (!ExperimentalVectorWideningLegalization &&
- VT.getVectorNumElements() < 8))
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
return SDValue();
// Input type should be vXi32.
@@ -40874,6 +41317,19 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return combineVectorTruncation(N, DAG, Subtarget);
}
+static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+ SDLoc DL(N);
+
+ if (auto SSatVal = detectSSatPattern(In, VT))
+ return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
+ if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+
+ return SDValue();
+}
+
/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
@@ -40883,10 +41339,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
/// In this case we go though all bitcasts.
/// This also recognizes splat of a negated value and returns the splat of that
/// value.
-static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
+static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
+ // Don't recurse exponentially.
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
@@ -40900,7 +41360,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
if (!SVOp->getOperand(1).isUndef())
return SDValue();
- if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode()))
+ if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1))
if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
SVOp->getMask());
@@ -40914,7 +41374,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
SDValue InsVal = Op.getOperand(1);
if (!InsVector.isUndef())
return SDValue();
- if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode()))
+ if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
NegInsVal, Op.getOperand(2));
@@ -40951,6 +41411,57 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
return SDValue();
}
+static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
+ bool NegRes) {
+ if (NegMul) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMADD: Opcode = ISD::FMA; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
+ }
+ }
+
+ if (NegAcc) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
+ case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
+ case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
+ case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
+ }
+ }
+
+ if (NegRes) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ }
+ }
+
+ return Opcode;
+}
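The switch tables above encode the usual FMA sign algebra; as a quick scalar reference (not part of the patch) for the opcode names involved:

// Scalar meaning of the opcodes remapped by negateFMAOpcode:
static inline double fmadd (double A, double B, double C) { return  A * B + C; }
static inline double fmsub (double A, double B, double C) { return  A * B - C; }
static inline double fnmadd(double A, double B, double C) { return -A * B + C; }
static inline double fnmsub(double A, double B, double C) { return -A * B - C; }
// NegMul negates the product (FMADD<->FNMADD, FMSUB<->FNMSUB), NegAcc negates
// the addend (FMADD<->FMSUB, FNMADD<->FNMSUB), and NegRes negates the whole
// result (FMADD<->FNMSUB, FMSUB<->FNMADD).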
+
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -40980,29 +41491,123 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
// If we're negating an FMA node, then we can adjust the
// instruction to include the extra negation.
- unsigned NewOpcode = 0;
if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
switch (Arg.getOpcode()) {
- case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
- case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
- // We can't handle scalar intrinsic node here because it would only
- // invert one element and not the whole vector. But we could try to handle
- // a negation of the lower element only.
- }
- }
- if (NewOpcode)
- return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
- Arg.getNode()->ops()));
+ case ISD::FMA:
+ case X86ISD::FMSUB:
+ case X86ISD::FNMADD:
+ case X86ISD::FNMSUB:
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB_RND: {
+ // We can't handle scalar intrinsic node here because it would only
+ // invert one element and not the whole vector. But we could try to handle
+ // a negation of the lower element only.
+ unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true);
+ return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops()));
+ }
+ }
+ }
return SDValue();
}
+char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations,
+ bool ForCodeSize,
+ unsigned Depth) const {
+ // fneg patterns are removable even if they have multiple uses.
+ if (isFNEG(DAG, Op.getNode(), Depth))
+ return 2;
+
+ // Don't recurse exponentially.
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return 0;
+
+ EVT VT = Op.getValueType();
+ EVT SVT = VT.getScalarType();
+ switch (Op.getOpcode()) {
+ case ISD::FMA:
+ case X86ISD::FMSUB:
+ case X86ISD::FNMADD:
+ case X86ISD::FNMSUB:
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB_RND: {
+ if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
+ !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
+ break;
+
+ // This is always negatible for free but we might be able to remove some
+ // extra operand negations as well.
+ for (int i = 0; i != 3; ++i) {
+ char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ if (V == 2)
+ return V;
+ }
+ return 1;
+ }
+ }
+
+ return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
+ ForCodeSize, Depth);
+}
+
+SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations,
+ bool ForCodeSize,
+ unsigned Depth) const {
+ // fneg patterns are removable even if they have multiple uses.
+ if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth))
+ return DAG.getBitcast(Op.getValueType(), Arg);
+
+ EVT VT = Op.getValueType();
+ EVT SVT = VT.getScalarType();
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::FMA:
+ case X86ISD::FMSUB:
+ case X86ISD::FNMADD:
+ case X86ISD::FNMSUB:
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB_RND: {
+ if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
+ !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
+ break;
+
+ // This is always negatible for free but we might be able to remove some
+ // extra operand negations as well.
+ SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
+ for (int i = 0; i != 3; ++i) {
+ char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ if (V == 2)
+ NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ }
+
+ bool NegA = !!NewOps[0];
+ bool NegB = !!NewOps[1];
+ bool NegC = !!NewOps[2];
+ unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
+
+ // Fill in the non-negated ops with the original values.
+ for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
+ if (!NewOps[i])
+ NewOps[i] = Op.getOperand(i);
+ return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
+ }
+ }
+
+ return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
+ ForCodeSize, Depth);
+}
+
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
@@ -41312,8 +41917,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- // Unless the load is volatile.
- if (!LN->isVolatile()) {
+ // Unless the load is volatile or atomic.
+ if (LN->isSimple()) {
SDLoc dl(N);
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getIntegerVT(NumBits);
@@ -41347,8 +41952,8 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- // Unless the load is volatile.
- if (!LN->isVolatile()) {
+ // Unless the load is volatile or atomic.
+ if (LN->isSimple()) {
SDLoc dl(N);
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getFloatingPointVT(NumBits);
@@ -41724,127 +42329,6 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
}
-/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
-/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
-/// with UNDEFs) of the input to vectors of the same size as the target type
-/// which then extends the lowest elements.
-static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- if (ExperimentalVectorWideningLegalization)
- return SDValue();
-
- unsigned Opcode = N->getOpcode();
- // TODO - add ANY_EXTEND support.
- if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
- return SDValue();
- if (!DCI.isBeforeLegalizeOps())
- return SDValue();
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- EVT VT = N->getValueType(0);
- EVT SVT = VT.getScalarType();
- EVT InVT = N0.getValueType();
- EVT InSVT = InVT.getScalarType();
-
- // FIXME: Generic DAGCombiner previously had a bug that would cause a
- // sign_extend of setcc to sometimes return the original node and tricked it
- // into thinking CombineTo was used which prevented the target combines from
- // running.
- // Earlying out here to avoid regressions like this
- // (v4i32 (sext (v4i1 (setcc (v4i16)))))
- // Becomes
- // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef))))
- // Type legalized to
- // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32)))))))
- // Leading to a packssdw+pmovsxwd
- // We could write a DAG combine to fix this, but really we shouldn't be
- // creating sext_invec that's forcing v8i16 into the DAG.
- if (N0.getOpcode() == ISD::SETCC)
- return SDValue();
-
- // Input type must be a vector and we must be extending legal integer types.
- if (!VT.isVector() || VT.getVectorNumElements() < 2)
- return SDValue();
- if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
- return SDValue();
- if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
- return SDValue();
-
- // If the input/output types are both legal then we have at least AVX1 and
- // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly.
- if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
- DAG.getTargetLoweringInfo().isTypeLegal(InVT))
- return SDValue();
-
- SDLoc DL(N);
-
- auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
- EVT SrcVT = N.getValueType();
- EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
- Size / SrcVT.getScalarSizeInBits());
- SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(),
- DAG.getUNDEF(SrcVT));
- Opnds[0] = N;
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds);
- };
-
- // If target-size is less than 128-bits, extend to a type that would extend
- // to 128 bits, extend that and extract the original target vector.
- if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
- unsigned Scale = 128 / VT.getSizeInBits();
- EVT ExVT =
- EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
- SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
- SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // If target-size is 128-bits (or 256-bits on AVX target), then convert to
- // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
- // Also use this if we don't have SSE41 to allow the legalizer do its job.
- if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget.hasAVX()) ||
- (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
- SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
- Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
- return DAG.getNode(Opcode, DL, VT, ExOp);
- }
-
- auto SplitAndExtendInReg = [&](unsigned SplitSize) {
- unsigned NumVecs = VT.getSizeInBits() / SplitSize;
- unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
- EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
- EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
-
- unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode);
- SmallVector<SDValue, 8> Opnds;
- for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
- SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
- DAG.getIntPtrConstant(Offset, DL));
- SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
- SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec);
- Opnds.push_back(SrcVec);
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
- };
-
- // On pre-AVX targets, split into 128-bit nodes of
- // ISD::*_EXTEND_VECTOR_INREG.
- if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128))
- return SplitAndExtendInReg(128);
-
- // On pre-AVX512 targets, split into 256-bit nodes of
- // ISD::*_EXTEND_VECTOR_INREG.
- if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
- return SplitAndExtendInReg(256);
-
- return SDValue();
-}
-
// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
@@ -41915,9 +42399,6 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
}
- if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
- return V;
-
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -41931,45 +42412,15 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
- if (NegMul) {
- switch (Opcode) {
- default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMADD: Opcode = ISD::FMA; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
- }
- }
-
- if (NegAcc) {
- switch (Opcode) {
- default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FMSUB: Opcode = ISD::FMA; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
- }
- }
-
- return Opcode;
-}
-
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
// Let legalize expand this if it isn't a legal type yet.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(VT))
return SDValue();
EVT ScalarVT = VT.getScalarType();
@@ -41980,17 +42431,21 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);
- auto invertIfNegative = [&DAG](SDValue &V) {
- if (SDValue NegVal = isFNEG(DAG, V.getNode())) {
- V = DAG.getBitcast(V.getValueType(), NegVal);
+ auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
+ if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) {
+ V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize);
return true;
}
// Look through extract_vector_elts. If it comes from an FNEG, create a
// new extract from the FNEG input.
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isNullConstant(V.getOperand(1))) {
- if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) {
- NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
+ SDValue Vec = V.getOperand(0);
+ if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) {
+ SDValue NegVal =
+ TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize);
V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
NegVal, V.getOperand(1));
return true;
@@ -42009,7 +42464,8 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if (!NegA && !NegB && !NegC)
return SDValue();
- unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
+ unsigned NewOpcode =
+ negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
@@ -42017,33 +42473,27 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
+// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
- SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode());
- if (!NegVal)
+ SDValue N2 = N->getOperand(2);
+ if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2)
return SDValue();
- // FIXME: Should we bitcast instead?
- if (NegVal.getValueType() != VT)
- return SDValue();
-
- unsigned NewOpcode;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected opcode!");
- case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
- case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
- case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
- case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
- }
+ SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize);
+ unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
- NegVal, N->getOperand(3));
+ NegN2, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
- NegVal);
+ NegN2);
}
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
@@ -42090,9 +42540,6 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
- if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
- return V;
-
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -42111,12 +42558,11 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
- unsigned NumSrcElts = N00.getValueType().getVectorNumElements();
unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
(N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
- return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128);
+ return concatSubVectors(N00, N01, DAG, dl);
}
}
@@ -42159,16 +42605,30 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
!IsOrXorXorCCZero)
return SDValue();
- // TODO: Use PXOR + PTEST for SSE4.1 or later?
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
+ bool HasAVX = Subtarget.hasAVX();
+
+ // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512.
+ // Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && Subtarget.hasAVX2()) ||
+ (OpSize == 256 && HasAVX) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
- EVT VecVT = OpSize == 512 ? MVT::v16i32 :
- OpSize == 256 ? MVT::v32i8 :
- MVT::v16i8;
- EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT;
+ bool HasPT = Subtarget.hasSSE41();
+ EVT VecVT = MVT::v16i8;
+ EVT CmpVT = MVT::v16i8;
+ if (OpSize == 256)
+ VecVT = CmpVT = MVT::v32i8;
+ if (OpSize == 512) {
+ if (Subtarget.hasBWI()) {
+ VecVT = MVT::v64i8;
+ CmpVT = MVT::v64i1;
+ } else {
+ VecVT = MVT::v16i32;
+ CmpVT = MVT::v16i1;
+ }
+ }
+
SDValue Cmp;
if (IsOrXorXorCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
@@ -42179,18 +42639,38 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
- SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
- SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
- Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
+ if (VecVT == CmpVT && HasPT) {
+ SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B);
+ SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D);
+ Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2);
+ } else {
+ SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+ SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
+ Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
+ }
} else {
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);
- Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+ if (VecVT == CmpVT && HasPT) {
+ Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
+ } else {
+ Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+ }
}
// For 512-bits we want to emit a setcc that will lower to kortest.
- if (OpSize == 512)
- return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp),
- DAG.getConstant(0xFFFF, DL, MVT::i16), CC);
+ if (VecVT != CmpVT) {
+ EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16;
+ SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT);
+ return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC);
+ }
+ if (HasPT) {
+ SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
+ Cmp);
+ SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
+ X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+ SDValue SetCC = getSETCC(X86CC, PT, DL, DAG);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0));
+ }
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
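As an illustration of the new XOR + PTEST path (not part of the patch; assumes SSE4.1 and possibly unaligned 16-byte buffers), the equivalent intrinsics sequence is roughly:

#include <immintrin.h>

// 16-byte equality via PXOR + PTEST: the XOR is all-zero iff the two blocks
// are equal, and PTEST sets ZF when (X & X) == 0.
static inline bool equal16Bytes(const void *P, const void *Q) {
  __m128i A = _mm_loadu_si128(static_cast<const __m128i *>(P));
  __m128i B = _mm_loadu_si128(static_cast<const __m128i *>(Q));
  __m128i X = _mm_xor_si128(A, B);
  return _mm_testz_si128(X, X) != 0;
}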
@@ -42270,8 +42750,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
// go through type promotion to a 128-bit vector.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
VT.getVectorElementType() == MVT::i1 &&
- (ExperimentalVectorWideningLegalization ||
- VT.getVectorNumElements() > 4) &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
@@ -42289,7 +42767,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = N->getSimpleValueType(0);
@@ -42310,7 +42789,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
// Look through int->fp bitcasts that don't change the element width.
unsigned EltWidth = SrcVT.getScalarSizeInBits();
- if (Src.getOpcode() == ISD::BITCAST &&
+ if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
@@ -42334,71 +42813,123 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // With vector masks we only demand the upper bit of the mask.
+ SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
+ auto *GorS = cast<MaskedGatherScatterSDNode>(N);
+ SDValue Chain = GorS->getChain();
+ SDValue Index = GorS->getIndex();
+ SDValue Mask = GorS->getMask();
+ SDValue Base = GorS->getBasePtr();
+ SDValue Scale = GorS->getScale();
- if (DCI.isBeforeLegalizeOps()) {
- SDValue Index = N->getOperand(4);
- // Remove any sign extends from 32 or smaller to larger than 32.
- // Only do this before LegalizeOps in case we need the sign extend for
- // legalization.
- if (Index.getOpcode() == ISD::SIGN_EXTEND) {
- if (Index.getScalarValueSizeInBits() > 32 &&
- Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index.getOperand(0);
- SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
- if (Res == N) {
- // The original sign extend has less users, add back to worklist in
- // case it needs to be removed
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
+ if (DCI.isBeforeLegalize()) {
+ unsigned IndexWidth = Index.getScalarValueSizeInBits();
+
+ // Shrink constant indices if they are larger than 32-bits.
+ // Only do this before legalize types since v2i64 could become v2i32.
+ // FIXME: We could check that the type is legal if we're after legalize
+ // types, but then we would need to construct test cases where that happens.
+ // FIXME: We could support more than just constant vectors, but we need to
+ // be careful with costing. A truncate that can be optimized out would be fine.
+ // Otherwise we might only want to create a truncate if it avoids a split.
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
+ if (BV->isConstant() && IndexWidth > 32 &&
+ DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+ unsigned NumElts = Index.getValueType().getVectorNumElements();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Chain, Gather->getPassThru(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
}
- return SDValue(Res, 0);
- }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Chain, Scatter->getValue(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType());
+ }
+ }
+
+ // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
+ // there are sufficient sign bits. Only do this before legalize types to
+ // avoid creating illegal types in truncate.
+ if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
+ Index.getOpcode() == ISD::ZERO_EXTEND) &&
+ IndexWidth > 32 &&
+ Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
+ DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+ unsigned NumElts = Index.getValueType().getVectorNumElements();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Chain, Gather->getPassThru(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
+ }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Chain, Scatter->getValue(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType());
}
+ }
+
+ if (DCI.isBeforeLegalizeOps()) {
+ unsigned IndexWidth = Index.getScalarValueSizeInBits();
// Make sure the index is either i32 or i64
- unsigned ScalarSize = Index.getScalarValueSizeInBits();
- if (ScalarSize != 32 && ScalarSize != 64) {
- MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
+ if (IndexWidth != 32 && IndexWidth != 64) {
+ MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index;
- SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
- if (Res == N)
- DCI.AddToWorklist(N);
- return SDValue(Res, 0);
- }
-
- // Try to remove zero extends from 32->64 if we know the sign bit of
- // the input is zero.
- if (Index.getOpcode() == ISD::ZERO_EXTEND &&
- Index.getScalarValueSizeInBits() == 64 &&
- Index.getOperand(0).getScalarValueSizeInBits() == 32) {
- if (DAG.SignBitIsZero(Index.getOperand(0))) {
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index.getOperand(0);
- SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
- if (Res == N) {
- // The original sign extend has less users, add back to worklist in
- // case it needs to be removed
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
- }
- return SDValue(Res, 0);
- }
- }
- }
-
- // With AVX2 we only demand the upper bit of the mask.
- if (!Subtarget.hasAVX512()) {
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Chain, Gather->getPassThru(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
+ }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Chain, Scatter->getValue(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType());
+ }
+ }
+
+ // With vector masks we only demand the upper bit of the mask.
+ if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue Mask = N->getOperand(2);
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
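The ComputeNumSignBits checks above amount to requiring that each index value survives a round trip through a signed 32-bit integer; a scalar sketch of that property (illustration only):

#include <cstdint>

// An i64 gather/scatter index can be truncated to i32 when it has more than
// 32 sign bits, i.e. sign-extending the truncated value recovers the original.
static inline bool indexFitsInI32(int64_t Idx) {
  return Idx == static_cast<int64_t>(static_cast<int32_t>(Idx));
}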
@@ -42432,7 +42963,7 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
// Make sure to not keep references to operands, as combineSetCCEFLAGS can
// RAUW them under us.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
- SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
+ SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
N->getOperand(1), Cond, Flags);
}
@@ -42549,6 +43080,7 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
@@ -42578,13 +43110,22 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
unsigned BitWidth = InVT.getScalarSizeInBits();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
if (NumSignBits >= (BitWidth - 31)) {
- EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
+ EVT TruncVT = MVT::i32;
if (InVT.isVector())
TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
InVT.getVectorNumElements());
SDLoc dl(N);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
- return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
+ if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
+ }
+ // If we're after legalize and the type is v2i32 we need to shuffle and
+ // use CVTSI2P.
+ assert(InVT == MVT::v2i64 && "Unexpected VT!");
+ SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
+ { 0, 2, -1, -1 });
+ return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
}
}
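A rough intrinsics sketch of the post-legalization v2i64 -> v2f64 case above (illustration only, assuming SSE2): pack the low 32 bits of each element with the { 0, 2, -1, -1 } shuffle, then convert with CVTDQ2PD:

#include <immintrin.h>

// v2i64 -> v2f64 when each element is known to fit in 32 signed bits:
// gather elements {0, 2} of the v4i32 view, then convert the low two i32s.
static inline __m128d sintToFPv2i64(__m128i V) {
  __m128i Lo32 = _mm_shuffle_epi32(V, _MM_SHUFFLE(3, 3, 2, 0));
  return _mm_cvtepi32_pd(Lo32); // CVTDQ2PD on the low 64 bits
}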
@@ -42604,7 +43145,7 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasDQI() && VT != MVT::f80)
return SDValue();
- if (!Ld->isVolatile() && !VT.isVector() &&
+ if (Ld->isSimple() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
@@ -42841,12 +43382,12 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
- SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL,
- MVT::i8),
- N->getOperand(2)),
- DAG.getConstant(1, DL, VT));
+ SDValue Res1 =
+ DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ N->getOperand(2)),
+ DAG.getConstant(1, DL, VT));
return DCI.CombineTo(N, Res1, CarryOut);
}
@@ -42906,7 +43447,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
Y.getOperand(1));
}
@@ -42924,7 +43465,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
NewEFLAGS);
}
}
@@ -42984,7 +43525,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
}
@@ -42997,7 +43538,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1);
}
}
@@ -43025,9 +43566,6 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.hasSSE2())
return SDValue();
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
-
EVT VT = N->getValueType(0);
// If the vector size is less than 128, or greater than the supported RegSize,
@@ -43035,14 +43573,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector() || VT.getVectorNumElements() < 8)
return SDValue();
- if (Op0.getOpcode() != ISD::MUL)
- std::swap(Op0, Op1);
- if (Op0.getOpcode() != ISD::MUL)
- return SDValue();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
- ShrinkMode Mode;
- if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16)
- return SDValue();
+ auto UsePMADDWD = [&](SDValue Op) {
+ ShrinkMode Mode;
+ return Op.getOpcode() == ISD::MUL &&
+ canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 &&
+ (!Subtarget.hasSSE41() ||
+ (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+ Op->isOnlyUserOf(Op.getOperand(1).getNode())));
+ };
+
+ SDValue MulOp, OtherOp;
+ if (UsePMADDWD(Op0)) {
+ MulOp = Op0;
+ OtherOp = Op1;
+ } else if (UsePMADDWD(Op1)) {
+ MulOp = Op1;
+ OtherOp = Op0;
+ } else
+ return SDValue();
SDLoc DL(N);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
@@ -43050,34 +43601,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
VT.getVectorNumElements() / 2);
+ // Shrink the operands of mul.
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
+
// Madd vector size is half of the original vector size
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
-
- auto BuildPMADDWD = [&](SDValue Mul) {
- // Shrink the operands of mul.
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0));
- SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1));
-
- SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
- PMADDWDBuilder);
- // Fill the rest of the output with 0
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd,
- DAG.getConstant(0, DL, MAddVT));
- };
-
- Op0 = BuildPMADDWD(Op0);
-
- // It's possible that Op1 is also a mul we can reduce.
- if (Op1.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) {
- Op1 = BuildPMADDWD(Op1);
- }
-
- return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
+ SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
+ PMADDWDBuilder);
+ // Fill the rest of the output with 0
+ SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+
+ // Preserve the reduction flag on the ADD. We may need to revisit for the
+ // other operand.
+ SDNodeFlags Flags;
+ Flags.setVectorReduction(true);
+ return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
}
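For reference, a scalar sketch (illustration only) of what VPMADDWD computes per 32-bit result lane, which is why the shrunk 16-bit multiply feeding a reduction add can be folded into it:

#include <cstdint>

// PMADDWD: multiply adjacent pairs of signed 16-bit elements and add the two
// 32-bit products, producing one i32 per pair.
static inline int32_t pmaddwdLane(int16_t A0, int16_t A1, int16_t B0,
                                  int16_t B1) {
  return static_cast<int32_t>(A0) * B0 + static_cast<int32_t>(A1) * B1;
}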
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
@@ -43087,8 +43631,6 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
EVT VT = N->getValueType(0);
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
// TODO: There's nothing special about i32, any integer type above i16 should
// work just as well.
@@ -43108,80 +43650,53 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
if (VT.getSizeInBits() / 4 > RegSize)
return SDValue();
- // We know N is a reduction add, which means one of its operands is a phi.
- // To match SAD, we need the other operand to be a ABS.
- if (Op0.getOpcode() != ISD::ABS)
- std::swap(Op0, Op1);
- if (Op0.getOpcode() != ISD::ABS)
- return SDValue();
-
- auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
- // SAD pattern detected. Now build a SAD instruction and an addition for
- // reduction. Note that the number of elements of the result of SAD is less
- // than the number of elements of its input. Therefore, we could only update
- // part of elements in the reduction vector.
- SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
-
- // The output of PSADBW is a vector of i64.
- // We need to turn the vector of i64 into a vector of i32.
- // If the reduction vector is at least as wide as the psadbw result, just
- // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
- // anyway.
- MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
- if (VT.getSizeInBits() >= ResVT.getSizeInBits())
- Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
- else
- Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
-
- if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
- // Fill the upper elements with zero to match the add width.
- SDValue Zero = DAG.getConstant(0, DL, VT);
- Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
- DAG.getIntPtrConstant(0, DL));
- }
-
- return Sad;
- };
+ // We know N is a reduction add. To match SAD, we need one of the operands to
+ // be an ABS.
+ SDValue AbsOp = N->getOperand(0);
+ SDValue OtherOp = N->getOperand(1);
+ if (AbsOp.getOpcode() != ISD::ABS)
+ std::swap(AbsOp, OtherOp);
+ if (AbsOp.getOpcode() != ISD::ABS)
+ return SDValue();
  // Check whether we have an abs-diff pattern feeding into the add.
SDValue SadOp0, SadOp1;
- if (!detectZextAbsDiff(Op0, SadOp0, SadOp1))
- return SDValue();
-
- Op0 = BuildPSADBW(SadOp0, SadOp1);
-
- // It's possible we have a sad on the other side too.
- if (Op1.getOpcode() == ISD::ABS &&
- detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
- Op1 = BuildPSADBW(SadOp0, SadOp1);
- }
-
- return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
-}
-
-/// Convert vector increment or decrement to sub/add with an all-ones constant:
-/// add X, <1, 1...> --> sub X, <-1, -1...>
-/// sub X, <1, 1...> --> add X, <-1, -1...>
-/// The all-ones vector constant can be materialized using a pcmpeq instruction
-/// that is commonly recognized as an idiom (has no register dependency), so
-/// that's better/smaller than loading a splat 1 constant.
-static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
- assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
- "Unexpected opcode for increment/decrement transform");
-
- // Pseudo-legality check: getOnesVector() expects one of these types, so bail
- // out and wait for legalization if we have an unsupported vector length.
- EVT VT = N->getValueType(0);
- if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
- return SDValue();
-
- APInt SplatVal;
- if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue())
- return SDValue();
-
- SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
- unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
- return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
+ if (!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
+ return SDValue();
+
+ // SAD pattern detected. Now build a SAD instruction and an addition for
+ // reduction. Note that the number of elements of the result of SAD is less
+ // than the number of elements of its input. Therefore, we can only update
+ // part of the elements in the reduction vector.
+ SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
+
+ // The output of PSADBW is a vector of i64.
+ // We need to turn the vector of i64 into a vector of i32.
+ // If the reduction vector is at least as wide as the psadbw result, just
+ // bitcast. If it's narrower, which can only occur for v2i32, bits 127:16 of
+ // the PSADBW will be zero. If we promote/narrow vectors, truncate the v2i64
+ // result to v2i32, which will be removed by type legalization. If we widen
+ // narrow vectors then we bitcast to v4i32 and extract v2i32.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
+ Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
+
+ if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
+ // Fill the upper elements with zero to match the add width.
+ assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
+ unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
+ Ops[0] = Sad;
+ Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
+ } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
+ Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Preserve the reduction flag on the ADD. We may need to revisit for the
+ // other operand.
+ SDNodeFlags Flags;
+ Flags.setVectorReduction(true);
+ return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
}
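In the same spirit, a hedged reference model (not from the patch) of PSADBW via the SSE2 _mm_sad_epu8 intrinsic: each 64-bit lane receives the sum of eight absolute byte differences, so at most bits 15:0 of a lane are non-zero, which is what lets the code above bitcast the i64 result straight to i32 elements.

// Reference model of PSADBW, illustrative only.
static __m128i sad_reference(__m128i A, __m128i B) {
  alignas(16) uint8_t a[16], b[16];
  alignas(16) uint64_t r[2] = {0, 0};
  _mm_store_si128((__m128i *)a, A);
  _mm_store_si128((__m128i *)b, B);
  for (int i = 0; i != 16; ++i)
    r[i / 8] += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
  return _mm_load_si128((__m128i *)r);
}
// The per-lane maximum is 8 * 255 = 2040, comfortably within 16 bits.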
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
@@ -43294,8 +43809,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
}
// Attempt to turn this pattern into PMADDWD.
-// (mul (add (zext (build_vector)), (zext (build_vector))),
-// (add (zext (build_vector)), (zext (build_vector)))
+// (mul (add (sext (build_vector)), (sext (build_vector))),
+// (add (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
@@ -43415,6 +43930,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
}
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
const SDNodeFlags Flags = N->getFlags();
if (Flags.hasVectorReduction()) {
@@ -43445,8 +43961,29 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
HADDBuilder);
}
- if (SDValue V = combineIncDecVector(N, DAG))
- return V;
+ // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
+ // (sub Y, (sext (vXi1 X))).
+ // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
+ // generic DAG combine without a legal type check, but adding this there
+ // caused regressions.
+ if (VT.isVector()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
+ Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
+ TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
+ SDLoc DL(N);
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
+ }
+
+ if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
+ Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
+ TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
+ SDLoc DL(N);
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
+ }
+ }
return combineAddOrSubToADCOrSBB(N, DAG);
}
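A quick scalar check of the identity behind the new vXi1 combine (illustrative only, helper name invented): zext of a boolean is 0 or 1 and sext is 0 or -1, so adding the zext equals subtracting the sext in two's complement.

#include <cassert>
#include <cstdint>

// add (zext i1 b), y  ==  sub y, (sext i1 b), checked for both boolean values.
static void check_zext_sext_identity(uint32_t y) {
  for (int b = 0; b <= 1; ++b) {
    uint32_t Zext = (uint32_t)b;   // zext i1 -> 0 or 1
    uint32_t Sext = (uint32_t)-b;  // sext i1 -> 0 or 0xFFFFFFFF
    assert(y + Zext == y - Sext);
  }
}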
@@ -43457,13 +43994,15 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
// PSUBUS is supported, starting from SSE2, but truncation for v8i32
// is only worth it with SSSE3 (PSHUFB).
- if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
+ EVT EltVT = VT.getVectorElementType();
+ if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
!(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
- !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
- !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
- VT == MVT::v16i32 || VT == MVT::v8i64)))
+ !(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
return SDValue();
SDValue SubusLHS, SubusRHS;
@@ -43493,16 +44032,13 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
} else
return SDValue();
- auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops);
- };
-
// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
// special preprocessing in some cases.
- if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
- return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
- { SubusLHS, SubusRHS }, USUBSATBuilder);
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
+
+ assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
+ "Unexpected VT!");
  // The special preprocessing case can only be applied
  // if the value was zero extended from 16 bits,
@@ -43531,15 +44067,16 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue NewSubusLHS =
DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
- SDValue Psubus =
- SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
- { NewSubusLHS, NewSubusRHS }, USUBSATBuilder);
+ SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
+ NewSubusLHS, NewSubusRHS);
+
  // Zero extend the result; it may be used somewhere as 32 bit. If not, the zext
  // and a following trunc will shrink it back.
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}
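For reference (not from the patch), ISD::USUBSAT on i8/i16 elements is the unsigned saturating subtraction that PSUBUSB/PSUBUSW implement, exposed by _mm_subs_epu8/_mm_subs_epu16; a one-line scalar model:

// Unsigned saturating subtraction: max(a - b, 0) in unsigned arithmetic.
static inline uint16_t usubsat16(uint16_t a, uint16_t b) {
  return a > b ? (uint16_t)(a - b) : 0;
}
// _mm_subs_epu16(A, B) applies this lane-wise to eight 16-bit elements.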
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -43576,9 +44113,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
HSUBBuilder);
}
- if (SDValue V = combineIncDecVector(N, DAG))
- return V;
-
// Try to create PSUBUS if SUB's argument is max/min
if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
return V;
@@ -43712,14 +44246,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
}
- // If we're inserting all zeros into the upper half, change this to
- // an insert into an all zeros vector. We will match this to a move
- // with implicit upper bit zeroing during isel.
- if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode()))
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- getZeroVector(VT, Subtarget, DAG, DL), Ops[0],
- DAG.getIntPtrConstant(0, DL));
-
return SDValue();
}
@@ -43786,10 +44312,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// least as large as the original insertion. Just insert the original
// subvector into a zero vector.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
- SubVec.getConstantOperandAPInt(1) == 0 &&
+ isNullConstant(SubVec.getOperand(1)) &&
SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Ins = SubVec.getOperand(0);
- if (Ins.getConstantOperandAPInt(2) == 0 &&
+ if (isNullConstant(Ins.getOperand(2)) &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
@@ -43825,31 +44351,42 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// Match concat_vector style patterns.
SmallVector<SDValue, 2> SubVectorOps;
- if (collectConcatOps(N, SubVectorOps))
+ if (collectConcatOps(N, SubVectorOps)) {
if (SDValue Fold =
combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
return Fold;
- // If we are inserting into both halves of the vector, the starting vector
- // should be undef. If it isn't, make it so. Only do this if the early insert
- // has no other uses.
- // TODO: Should this be a generic DAG combine?
- // TODO: Why doesn't SimplifyDemandedVectorElts catch this?
- if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
- Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
- isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
- Vec.hasOneUse()) {
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
- Vec.getOperand(1), Vec.getOperand(2));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
- N->getOperand(2));
+ // If we're inserting all zeros into the upper half, change this to
+ // a concat with zero. We will match this to a move
+ // with implicit upper bit zeroing during isel.
+ // We do this here because we don't want combineConcatVectorOps to
+ // create INSERT_SUBVECTOR from CONCAT_VECTORS.
+ if (SubVectorOps.size() == 2 &&
+ ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
+ SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
}
// If this is a broadcast insert into an upper undef, use a larger broadcast.
if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
+ // If this is a broadcast load inserted into an upper undef, use a larger
+ // broadcast load.
+ if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
+ SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
+ SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
+ SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+
return SDValue();
}
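The broadcast-load widening above relies on a wider broadcast from the same address reproducing the narrow broadcast in its low lanes; a hedged AVX illustration (helper name invented, NaN inputs excluded by the compare):

// The low 128 bits of a 256-bit broadcast equal the 128-bit broadcast, so
// growing a broadcast whose upper lanes were undef anyway is safe.
static bool low_half_matches(const float *p) {
  __m256 Wide = _mm256_broadcast_ss(p);
  __m128 Narrow = _mm_broadcast_ss(p);
  __m128 Eq = _mm_cmpeq_ps(_mm256_castps256_ps128(Wide), Narrow);
  return _mm_movemask_ps(Eq) == 0xF;
}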
@@ -43928,12 +44465,15 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
return SDValue();
MVT VT = N->getSimpleValueType(0);
- EVT WideVecVT = N->getOperand(0).getValueType();
- SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
+ SDValue InVec = N->getOperand(0);
+ SDValue InVecBC = peekThroughBitcasts(InVec);
+ EVT InVecVT = InVec.getValueType();
+ EVT InVecBCVT = InVecBC.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
- TLI.isTypeLegal(WideVecVT) &&
- WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) {
+ TLI.isTypeLegal(InVecVT) &&
+ InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
auto isConcatenatedNot = [] (SDValue V) {
V = peekThroughBitcasts(V);
if (!isBitwiseNot(V))
@@ -43941,12 +44481,12 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue NotOp = V->getOperand(0);
return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
};
- if (isConcatenatedNot(WideVec.getOperand(0)) ||
- isConcatenatedNot(WideVec.getOperand(1))) {
+ if (isConcatenatedNot(InVecBC.getOperand(0)) ||
+ isConcatenatedNot(InVecBC.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
- SDValue Concat = split256IntArith(WideVec, DAG);
+ SDValue Concat = split256IntArith(InVecBC, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
- DAG.getBitcast(WideVecVT, Concat), N->getOperand(1));
+ DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
}
}
@@ -43956,7 +44496,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
- SDValue InVec = N->getOperand(0);
unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
@@ -43976,31 +44515,42 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// Try to move vector bitcast after extract_subv by scaling extraction index:
// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
// TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
- if (InVec.getOpcode() == ISD::BITCAST &&
- InVec.getOperand(0).getValueType().isVector()) {
- SDValue SrcOp = InVec.getOperand(0);
- EVT SrcVT = SrcOp.getValueType();
- unsigned SrcNumElts = SrcVT.getVectorNumElements();
- unsigned DestNumElts = InVec.getValueType().getVectorNumElements();
+ if (InVec != InVecBC && InVecBCVT.isVector()) {
+ unsigned SrcNumElts = InVecBCVT.getVectorNumElements();
+ unsigned DestNumElts = InVecVT.getVectorNumElements();
if ((DestNumElts % SrcNumElts) == 0) {
unsigned DestSrcRatio = DestNumElts / SrcNumElts;
if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
- SrcVT.getScalarType(), NewExtNumElts);
+ InVecBCVT.getScalarType(), NewExtNumElts);
if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
SDLoc DL(N);
SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
- SrcOp, NewIndex);
+ InVecBC, NewIndex);
return DAG.getBitcast(VT, NewExtract);
}
}
}
}
+ // If we are extracting from an insert into a zero vector, replace with a
+ // smaller insert into zero, provided the extraction covers at least the
+ // original inserted subvector. Don't do this for i1 vectors.
+ if (VT.getVectorElementType() != MVT::i1 &&
+ InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
+ InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
+ ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
+ InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL),
+ InVec.getOperand(1), InVec.getOperand(2));
+ }
+
// If we're extracting from a broadcast then we're better off just
// broadcasting to the smaller type directly, assuming this is the only use.
  // As it's a broadcast we don't care about the extraction index.
@@ -44008,11 +44558,25 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
+ if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
+ if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+ }
+
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
- if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
+ if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
@@ -44093,7 +44657,8 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
// Simplify PMULDQ and PMULUDQ operations.
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -44103,23 +44668,43 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
// Multiply by zero.
+ // Don't return RHS as it may contain UNDEFs.
if (ISD::isBuildVectorAllZeros(RHS.getNode()))
- return RHS;
-
- // Aggressively peek through ops to get at the demanded low bits.
- APInt DemandedMask = APInt::getLowBitsSet(64, 32);
- SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask);
- SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask);
- if (DemandedLHS || DemandedRHS)
- return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
- DemandedLHS ? DemandedLHS : LHS,
- DemandedRHS ? DemandedRHS : RHS);
+ return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
return SDValue(N, 0);
+ // If the input is an extend_invec and the SimplifyDemandedBits call didn't
+ // convert it to any_extend_invec, due to the LegalOperations check, do the
+ // conversion directly to a vector shuffle manually. This exposes combine
+ // opportunities missed by combineExtInVec not calling
+ // combineX86ShufflesRecursively on SSE4.1 targets.
+ // FIXME: This is basically a hack around several other issues related to
+ // ANY_EXTEND_VECTOR_INREG.
+ if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
+ (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+ LHS.getOperand(0).getValueType() == MVT::v4i32) {
+ SDLoc dl(N);
+ LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
+ LHS.getOperand(0), { 0, -1, 1, -1 });
+ LHS = DAG.getBitcast(MVT::v2i64, LHS);
+ return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+ }
+ if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
+ (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+ RHS.getOperand(0).getValueType() == MVT::v4i32) {
+ SDLoc dl(N);
+ RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
+ RHS.getOperand(0), { 0, -1, 1, -1 });
+ RHS = DAG.getBitcast(MVT::v2i64, RHS);
+ return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+ }
+
return SDValue();
}
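The { 0, -1, 1, -1 } shuffle trick works because PMULDQ/PMULUDQ read only the low 32 bits of each 64-bit element; a reference model of the unsigned form (_mm_mul_epu32), illustrative only:

// Each 64-bit result lane is the product of the low 32 bits of the matching
// input lanes; whatever occupies the high halves (zeros, sign bits, shuffle
// leftovers) is ignored.
static __m128i pmuludq_reference(__m128i A, __m128i B) {
  alignas(16) uint64_t a[2], b[2], r[2];
  _mm_store_si128((__m128i *)a, A);
  _mm_store_si128((__m128i *)b, B);
  for (int i = 0; i != 2; ++i)
    r[i] = (uint64_t)(uint32_t)a[i] * (uint32_t)b[i];
  return _mm_load_si128((__m128i *)r);
}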
@@ -44134,7 +44719,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
In.hasOneUse()) {
auto *Ld = cast<LoadSDNode>(In);
- if (!Ld->isVolatile()) {
+ if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
@@ -44150,17 +44735,6 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
}
}
- // Disabling for widening legalization for now. We can enable if we find a
- // case that needs it. Otherwise it can be deleted when we switch to
- // widening legalization.
- if (ExperimentalVectorWideningLegalization)
- return SDValue();
-
- // Combine (ext_invec (ext_invec X)) -> (ext_invec X)
- if (In.getOpcode() == N->getOpcode() &&
- TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
- return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
-
// Attempt to combine as a shuffle.
// TODO: SSE41 support
if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
@@ -44173,6 +44747,20 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -44196,8 +44784,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case X86ISD::CMP: return combineCMP(N, DAG);
- case ISD::ADD: return combineAdd(N, DAG, Subtarget);
- case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
+ case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
@@ -44214,12 +44802,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
- case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget);
case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
+ case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
@@ -44299,20 +44888,22 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
case X86ISD::FNMSUB_RND:
- case ISD::FMA: return combineFMA(N, DAG, Subtarget);
+ case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
- case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
- case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
+ case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
+ case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
case X86ISD::MGATHER:
- case X86ISD::MSCATTER:
+ case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
case ISD::MGATHER:
- case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
+ case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
case X86ISD::PMULDQ:
- case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
+ case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
+ case X86ISD::KSHIFTL:
+ case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
}
return SDValue();
@@ -44660,10 +45251,11 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
case 'I':
case 'J':
case 'K':
- case 'L':
- case 'M':
case 'N':
case 'G':
+ case 'L':
+ case 'M':
+ return C_Immediate;
case 'C':
case 'e':
case 'Z':
@@ -45175,8 +45767,9 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
- // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
- // Vector types.
+ // TODO: Handle i128 in FR128RegClass after it is tested well.
+ // Vector types and fp128.
+ case MVT::f128:
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
@@ -45469,7 +46062,7 @@ void X86TargetLowering::insertCopiesSplitCSR(
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
@@ -45514,3 +46107,16 @@ X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
+
+unsigned
+X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
+  // The default stack probe size is 4096 if the function has no
+  // "stack-probe-size" attribute.
+ unsigned StackProbeSize = 4096;
+ const Function &Fn = MF.getFunction();
+ if (Fn.hasFnAttribute("stack-probe-size"))
+ Fn.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+ return StackProbeSize;
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index e0be03bc3f9d..6f7e90008de4 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/Target/TargetOptions.h"
namespace llvm {
class X86Subtarget;
@@ -144,6 +143,10 @@ namespace llvm {
/// relative displacements.
WrapperRIP,
+ /// Copies a 64-bit value from an MMX vector to the low word
+ /// of an XMM vector, with the high word zero filled.
+ MOVQ2DQ,
+
/// Copies a 64-bit value from the low word of an XMM vector
/// to an MMX vector.
MOVDQ2Q,
@@ -422,7 +425,8 @@ namespace llvm {
// Tests Types Of a FP Values for scalar types.
VFPCLASSS,
- // Broadcast scalar to vector.
+ // Broadcast (splat) scalar or element 0 of a vector. If the operand is
+ // a vector, this node may change the vector length as part of the splat.
VBROADCAST,
// Broadcast mask to vector.
VBROADCASTM,
@@ -611,6 +615,9 @@ namespace llvm {
// extract_vector_elt, store.
VEXTRACT_STORE,
+ // scalar broadcast from memory
+ VBROADCAST_LOAD,
+
  // Store FP control word into i16 memory.
FNSTCW16m,
@@ -680,6 +687,9 @@ namespace llvm {
bool isCalleePop(CallingConv::ID CallingConv,
bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
+ /// If Op is a constant whose elements are all the same constant or
+ /// undefined, return true and return the constant value in \p SplatVal.
+ bool isConstantSplat(SDValue Op, APInt &SplatVal);
} // end namespace X86
//===--------------------------------------------------------------------===//
@@ -792,6 +802,17 @@ namespace llvm {
/// and some i16 instructions are slow.
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+ /// Return 1 if we can compute the negated form of the specified expression
+ /// for the same cost as the expression itself, or 2 if we can compute the
+ /// negated form more cheaply than the expression itself. Else return 0.
+ char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
+ bool ForCodeSize, unsigned Depth) const override;
+
+ /// If isNegatibleForFree returns a non-zero value, return the newly negated
+ /// expression.
+ SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations, bool ForCodeSize,
+ unsigned Depth) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
@@ -840,6 +861,13 @@ namespace llvm {
bool hasAndNot(SDValue Y) const override;
+ bool hasBitTest(SDValue X, SDValue Y) const override;
+
+ bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const override;
+
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
@@ -863,11 +891,7 @@ namespace llvm {
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
- bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
- if (DAG.getMachineFunction().getFunction().hasMinSize())
- return false;
- return true;
- }
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
bool shouldSplatInsEltVarIndex(EVT VT) const override;
@@ -913,6 +937,10 @@ namespace llvm {
TargetLoweringOpt &TLO,
unsigned Depth) const override;
+ SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const override;
+
const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
SDValue unwrapAddress(SDValue N) const override;
@@ -1090,11 +1118,12 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
- bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const override;
+ bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
bool convertSelectOfConstantsToMath(EVT VT) const override;
- bool decomposeMulByConstant(EVT VT, SDValue C) const override;
+ bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const override;
bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
bool IsSigned) const override;
@@ -1136,8 +1165,8 @@ namespace llvm {
return nullptr; // nothing to do, move along.
}
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
@@ -1189,12 +1218,18 @@ namespace llvm {
CallingConv::ID CC,
EVT VT) const override;
+ unsigned getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const override;
+
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
+ unsigned getStackProbeSize(MachineFunction &MF) const;
+
bool hasVectorBlend() const override { return true; }
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
@@ -1326,6 +1361,12 @@ namespace llvm {
SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
+ RTLIB::Libcall Call) const;
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -1372,6 +1413,9 @@ namespace llvm {
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+ bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
+ bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;
+
bool needsCmpXchgNb(Type *MemType) const;
void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
@@ -1462,6 +1506,9 @@ namespace llvm {
/// Reassociate floating point divisions into multiply by reciprocal.
unsigned combineRepeatedFPDivisors() const override;
+
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const override;
};
namespace X86 {
@@ -1625,24 +1672,24 @@ namespace llvm {
/// mask. This is the reverse process to canWidenShuffleElements, but can
/// always succeed.
template <typename T>
- void scaleShuffleMask(int Scale, ArrayRef<T> Mask,
+ void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask,
SmallVectorImpl<T> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
size_t NumElts = Mask.size();
ScaledMask.assign(NumElts * Scale, -1);
- for (int i = 0; i != (int)NumElts; ++i) {
+ for (size_t i = 0; i != NumElts; ++i) {
int M = Mask[i];
// Repeat sentinel values in every mask element.
if (M < 0) {
- for (int s = 0; s != Scale; ++s)
+ for (size_t s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = M;
continue;
}
// Scale mask element and increment across each mask element.
- for (int s = 0; s != Scale; ++s)
+ for (size_t s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = (Scale * M) + s;
}
}
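A small usage sketch of scaleShuffleMask after the size_t change (values illustrative; assumes the usual LLVM ADT headers): sentinels are repeated, real indices are expanded and offset.

// Scale = 2, Mask = {0, -1, 3, 1}  =>  ScaledMask = {0, 1, -1, -1, 6, 7, 2, 3}
int Mask[] = {0, -1, 3, 1};
SmallVector<int, 8> ScaledMask;
scaleShuffleMask<int>(2, makeArrayRef(Mask), ScaledMask);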
diff --git a/lib/Target/X86/X86IndirectBranchTracking.cpp b/lib/Target/X86/X86IndirectBranchTracking.cpp
index 04e8b2231fec..cc0f59ab329d 100644
--- a/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -84,7 +84,7 @@ bool X86IndirectBranchTrackingPass::addENDBR(
return false;
}
-bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
+static bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
if (!MOp.isGlobal())
return false;
auto *CalleeFn = dyn_cast<Function>(MOp.getGlobal());
diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp
index 02ae73706a34..2b1e3f23efd7 100644
--- a/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/lib/Target/X86/X86InsertPrefetch.cpp
@@ -79,8 +79,8 @@ ErrorOr<PrefetchHints> getPrefetchHints(const FunctionSamples *TopSamples,
// The prefetch instruction can't take memory operands involving vector
// registers.
bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) {
- unsigned BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg();
- unsigned IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg();
+ Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg();
+ Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg();
return (BaseReg == 0 ||
X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) ||
X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) &&
@@ -108,7 +108,7 @@ bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples,
Prefetches &Prefetches) const {
assert(Prefetches.empty() &&
"Expected caller passed empty PrefetchInfo vector.");
- static const std::pair<const StringRef, unsigned> HintTypes[] = {
+ static constexpr std::pair<StringLiteral, unsigned> HintTypes[] = {
{"_nta_", X86::PREFETCHNTA},
{"_t0_", X86::PREFETCHT0},
{"_t1_", X86::PREFETCHT1},
@@ -173,7 +173,7 @@ bool X86InsertPrefetch::doInitialization(Module &M) {
void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
}
bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) {
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 54eddeacaa17..9b5de59430a5 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -74,6 +74,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName);
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
+ PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName);
ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"),
!cast<ComplexPattern>("sse_load_f32"),
@@ -412,6 +413,14 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllOnesV))]>;
}
+let Predicates = [HasAVX512] in {
+def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+}
+
// Alias instructions that allow VPTERNLOG to be used with a mask to create
// a mix of all ones and all zeros elements. This is done this way to force
// the same register to be used as input for all three sources.
@@ -436,6 +445,19 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
[(set VR256X:$dst, (v8i32 immAllZerosV))]>;
}
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>;
+}
+
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -443,7 +465,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
[(set FR32X:$dst, fp32imm0)]>;
def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
- [(set FR64X:$dst, fpimm0)]>;
+ [(set FR64X:$dst, fp64imm0)]>;
+ def AVX512_FsFLD0F128 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
+ [(set VR128X:$dst, fp128imm0)]>;
}
//===----------------------------------------------------------------------===//
@@ -730,14 +754,14 @@ let isCommutable = 1 in
def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>,
EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))]>,
+ timm:$src3))]>,
EVEX_4V, EVEX_CD8<32, CD8VT1>,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
@@ -1100,75 +1124,104 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
X86VectorVTInfo MaskInfo,
X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo,
- SDPatternOperator UnmaskedOp = X86VBroadcast> {
- let ExeDomain = DestInfo.ExeDomain, hasSideEffects = 0 in {
- defm r : AVX512_maskable_split<opc, MRMSrcReg, MaskInfo,
- (outs MaskInfo.RC:$dst),
- (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))),
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>,
- T8PD, EVEX, Sched<[SchedRR]>;
- let mayLoad = 1 in
- defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo,
- (outs MaskInfo.RC:$dst),
- (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT (UnmaskedOp
- (SrcInfo.ScalarLdFrag addr:$src))))),
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT (X86VBroadcast
- (SrcInfo.ScalarLdFrag addr:$src)))))>,
- T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>,
- Sched<[SchedRM]>;
- }
-
- def : Pat<(MaskInfo.VT
- (bitconvert
- (DestInfo.VT (UnmaskedOp
- (SrcInfo.VT (scalar_to_vector
- (SrcInfo.ScalarLdFrag addr:$src))))))),
- (!cast<Instruction>(Name#MaskInfo.ZSuffix#m) addr:$src)>;
- def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
+ bit IsConvertibleToThreeAddress,
+ SDPatternOperator UnmaskedOp = X86VBroadcast,
+ SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> {
+ let hasSideEffects = 0 in
+ def r : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))],
+ DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>;
+ def rkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
(bitconvert
(DestInfo.VT
- (X86VBroadcast
- (SrcInfo.VT (scalar_to_vector
- (SrcInfo.ScalarLdFrag addr:$src)))))),
- MaskInfo.RC:$src0)),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#mk)
- MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>;
- def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
+ let Constraints = "$src0 = $dst" in
+ def rk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>;
+
+ let hasSideEffects = 0, mayLoad = 1 in
+ def m : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedBcastOp addr:$src)))))],
+ DestInfo.ExeDomain>, T8PD, EVEX,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+
+ def mkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
(bitconvert
(DestInfo.VT
- (X86VBroadcast
- (SrcInfo.VT (scalar_to_vector
- (SrcInfo.ScalarLdFrag addr:$src)))))),
- MaskInfo.ImmAllZerosV)),
- (!cast<Instruction>(Name#MaskInfo.ZSuffix#mkz)
- MaskInfo.KRCWM:$mask, addr:$src)>;
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+
+ let Constraints = "$src0 = $dst",
+ isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
+ def mk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
}
// Helper class to force mask and broadcast result to same type.
multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name,
SchedWrite SchedRR, SchedWrite SchedRM,
X86VectorVTInfo DestInfo,
- X86VectorVTInfo SrcInfo> :
+ X86VectorVTInfo SrcInfo,
+ bit IsConvertibleToThreeAddress> :
avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM,
- DestInfo, DestInfo, SrcInfo>;
+ DestInfo, DestInfo, SrcInfo,
+ IsConvertibleToThreeAddress>;
multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info512, _.info128>,
+ WriteFShuffle256Ld, _.info512, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
_.info128>,
EVEX_V512;
@@ -1176,7 +1229,7 @@ multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info256, _.info128>,
+ WriteFShuffle256Ld, _.info256, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
_.info128>,
EVEX_V256;
@@ -1187,7 +1240,7 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info512, _.info128>,
+ WriteFShuffle256Ld, _.info512, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
_.info128>,
EVEX_V512;
@@ -1195,12 +1248,12 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info256, _.info128>,
+ WriteFShuffle256Ld, _.info256, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
_.info128>,
EVEX_V256;
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info128, _.info128>,
+ WriteFShuffle256Ld, _.info128, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128,
_.info128>,
EVEX_V128;
@@ -1284,46 +1337,35 @@ defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
X86VBroadcast, GR64, HasAVX512>, VEX_W;
-// Provide aliases for broadcast from the same register class that
-// automatically does the extract.
-multiclass avx512_int_broadcast_rm_lowering<string Name,
- X86VectorVTInfo DestInfo,
- X86VectorVTInfo SrcInfo,
- X86VectorVTInfo ExtInfo> {
- def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#"r")
- (ExtInfo.VT (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm)))>;
-}
-
multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _, Predicate prd> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ bit IsConvertibleToThreeAddress> {
let Predicates = [prd] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
- WriteShuffle256Ld, _.info512, _.info128>,
- avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info256, _.info128>,
+ WriteShuffle256Ld, _.info512, _.info128,
+ IsConvertibleToThreeAddress>,
EVEX_V512;
- // Defined separately to avoid redefinition.
- defm Z_Alt : avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info512, _.info128>;
}
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
- WriteShuffle256Ld, _.info256, _.info128>,
- avx512_int_broadcast_rm_lowering<NAME, _.info256, _.info256, _.info128>,
+ WriteShuffle256Ld, _.info256, _.info128,
+ IsConvertibleToThreeAddress>,
EVEX_V256;
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle,
- WriteShuffleXLd, _.info128, _.info128>,
+ WriteShuffleXLd, _.info128, _.info128,
+ IsConvertibleToThreeAddress>,
EVEX_V128;
}
}
defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb",
- avx512vl_i8_info, HasBWI>;
+ avx512vl_i8_info, HasBWI, 0>;
defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
- avx512vl_i16_info, HasBWI>;
+ avx512vl_i16_info, HasBWI, 0>;
defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
- avx512vl_i32_info, HasAVX512>;
+ avx512vl_i32_info, HasAVX512, 1>;
defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
- avx512vl_i64_info, HasAVX512>, VEX_W1X;
+ avx512vl_i64_info, HasAVX512, 1>, VEX_W1X;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
@@ -1354,6 +1396,10 @@ let Predicates = [HasAVX512] in {
  // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZm addr:$src)>;
+
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VPBROADCASTDZm addr:$src)>;
}
let Predicates = [HasVLX] in {
@@ -1362,6 +1408,12 @@ let Predicates = [HasVLX] in {
(VPBROADCASTQZ128m addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ256m addr:$src)>;
+
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VPBROADCASTDZ128m addr:$src)>;
+ def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VPBROADCASTDZ256m addr:$src)>;
}
let Predicates = [HasVLX, HasBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
@@ -1382,6 +1434,12 @@ let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
+
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWZ128m addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWZ256m addr:$src)>;
}
let Predicates = [HasBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
@@ -1394,6 +1452,10 @@ let Predicates = [HasBWI] in {
def : Pat<(v32i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZm addr:$src)>;
+
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWZm addr:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -1629,12 +1691,12 @@ multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
let Predicates = [HasDQI] in
defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info512,
- _Src.info512, _Src.info128, null_frag>,
+ _Src.info512, _Src.info128, 0, null_frag, null_frag>,
EVEX_V512;
let Predicates = [HasDQI, HasVLX] in
defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info256,
- _Src.info256, _Src.info128, null_frag>,
+ _Src.info256, _Src.info128, 0, null_frag, null_frag>,
EVEX_V256;
}
@@ -1645,7 +1707,7 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
let Predicates = [HasDQI, HasVLX] in
defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle,
WriteShuffleXLd, _Dst.info128,
- _Src.info128, _Src.info128, null_frag>,
+ _Src.info128, _Src.info128, 0, null_frag, null_frag>,
EVEX_V128;
}
@@ -1654,23 +1716,6 @@ defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
avx512vl_f32_info, avx512vl_f64_info>;
-let Predicates = [HasVLX] in {
-def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))),
- (VBROADCASTSSZ256r (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>;
-def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))),
- (VBROADCASTSDZ256r (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>;
-}
-
-def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
- (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))>;
-def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
- (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>;
-
-def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
- (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))>;
-def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
- (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>;
-
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST MASK TO VECTOR REGISTER
//---
@@ -1730,7 +1775,7 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src2,
- IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>,
+ IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
AVX5128IBase, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1807,7 +1852,7 @@ multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _,
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86VPermt2 _.RC:$src2,
(IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
- (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ (_.BroadcastLdFrag addr:$src3)),
(_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
(!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3)>;
@@ -1846,7 +1891,7 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
- IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>,
+ IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
AVX5128IBase, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1947,7 +1992,7 @@ multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr,
}
multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let mayLoad = 1, hasSideEffects = 0 in {
+ let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
@@ -2031,9 +2076,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
(OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>;
+ timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -2041,9 +2086,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- imm:$cc),
+ timm:$cc),
(OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- imm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
+ timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
@@ -2052,9 +2097,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc",
(OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc),
+ timm:$cc),
(OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc)>,
+ timm:$cc)>,
EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
let isCodeGenOnly = 1 in {
@@ -2065,7 +2110,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
- imm:$cc))]>,
+ timm:$cc))]>,
EVEX_4V, VEX_LIG, Sched<[sched]>;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
@@ -2074,7 +2119,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
- imm:$cc))]>,
+ timm:$cc))]>,
EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -2100,94 +2145,82 @@ let Predicates = [HasAVX512] in {
SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W;
}
-multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su, X86FoldableSchedWrite sched,
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, bit IsCommutable> {
- let isCommutable = IsCommutable in
+ let isCommutable = IsCommutable, hasSideEffects = 0 in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))]>,
- EVEX_4V, Sched<[sched]>;
+ []>, EVEX_4V, Sched<[sched]>;
+ let mayLoad = 1, hasSideEffects = 0 in
def rm : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (_.VT (_.LdFrag addr:$src2))))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
- let isCommutable = IsCommutable in
+ []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let isCommutable = IsCommutable, hasSideEffects = 0 in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
- [(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
- EVEX_4V, EVEX_K, Sched<[sched]>;
+ []>, EVEX_4V, EVEX_K, Sched<[sched]>;
+ let mayLoad = 1, hasSideEffects = 0 in
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
- [(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode_su (_.VT _.RC:$src1),
- (_.VT (_.LdFrag addr:$src2)))))]>,
- EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su,
+multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
bit IsCommutable> :
- avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched, _, IsCommutable> {
+ avx512_icmp_packed<opc, OpcodeStr, sched, _, IsCommutable> {
+ let mayLoad = 1, hasSideEffects = 0 in {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
"|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode_su (_.VT _.RC:$src1),
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))]>,
- EVEX_4V, EVEX_K, EVEX_B,
+ []>, EVEX_4V, EVEX_K, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
}
-multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su, X86SchedWriteWidths sched,
+multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd,
bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM,
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM,
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM,
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
- PatFrag OpNode, PatFrag OpNode_su,
X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo,
Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM,
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM,
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM,
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
@@ -2195,53 +2228,42 @@ multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
// This fragment treats X86cmpm as commutable to help match loads in both
// operands for PCMPEQ.
def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>;
-def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
- (X86setcc_commute node:$src1, node:$src2, SETEQ)>;
def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
(setcc node:$src1, node:$src2, SETGT)>;
-def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2),
- (X86pcmpeqm_c node:$src1, node:$src2), [{
- return N->hasOneUse();
-}]>;
-def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2),
- (X86pcmpgtm node:$src1, node:$src2), [{
- return N->hasOneUse();
-}]>;
-
// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
// increase the pattern complexity the way an immediate would.
let AddedComplexity = 2 in {
// FIXME: Is there a better scheduler class for VPCMP?
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su,
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb",
SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su,
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw",
SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su,
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd",
SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su,
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq",
SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su,
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb",
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su,
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw",
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su,
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd",
SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su,
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq",
SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
}
@@ -2322,8 +2344,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
"$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[(set _.KRC:$dst, (_.KVT (Frag:$cc
(_.VT _.RC:$src1),
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
+ (_.BroadcastLdFrag addr:$src2),
cond)))]>,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
@@ -2335,23 +2356,21 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(_.KVT (Frag_su:$cc
(_.VT _.RC:$src1),
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
+ (_.BroadcastLdFrag addr:$src2),
cond))))]>,
EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
- def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ def : Pat<(_.KVT (CommFrag:$cc (_.BroadcastLdFrag addr:$src2),
(_.VT _.RC:$src1), cond)),
(!cast<Instruction>(Name#_.ZSuffix#"rmib")
_.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
def : Pat<(and _.KRCWM:$mask,
- (_.KVT (CommFrag_su:$cc (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
+ (_.KVT (CommFrag_su:$cc (_.BroadcastLdFrag addr:$src2),
(_.VT _.RC:$src1), cond))),
(!cast<Instruction>(Name#_.ZSuffix#"rmibk")
_.KRCWM:$mask, _.RC:$src1, addr:$src2,
- (CommFrag.OperandTransform $cc))>;
+ (CommFrag_su.OperandTransform $cc))>;
}
multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
@@ -2496,14 +2515,19 @@ def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
return N->hasOneUse();
}]>;
+def X86cmpm_imm_commute : SDNodeXForm<timm, [{
+ uint8_t Imm = X86::getSwappedVCMPImm(N->getZExtValue() & 0x1f);
+ return getI8Imm(Imm, SDLoc(N));
+}]>;
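// A sketch of the intent, assuming X86::getSwappedVCMPImm follows the usual
// VCMP predicate table: when the compare operands are commuted to fold a
// load, the immediate must be mirrored as well, e.g. LT_OS pairs with GT_OS
// and LE_OS with GE_OS, while symmetric predicates such as EQ_OQ map to
// themselves. Masking with 0x1f keeps the value inside the 32 defined AVX
// predicate encodings before the i8 immediate is rebuilt.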
+
multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
- (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
- (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
1>, Sched<[sched]>;
defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
@@ -2511,9 +2535,9 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
(X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
- imm:$cc),
+ timm:$cc),
(X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
- imm:$cc)>,
+ timm:$cc)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
@@ -2523,38 +2547,37 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
"$cc, ${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr#", $cc",
(X86cmpm (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc),
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ timm:$cc),
(X86cmpm_su (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc)>,
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ timm:$cc)>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Patterns for selecting with loads in other operand.
def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
- CommutableCMPCC:$cc),
+ timm:$cc),
(!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
- imm:$cc)>;
+ (X86cmpm_imm_commute timm:$cc))>;
def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2),
(_.VT _.RC:$src1),
- CommutableCMPCC:$cc)),
+ timm:$cc)),
(!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
- imm:$cc)>;
+ (X86cmpm_imm_commute timm:$cc))>;
- def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1), CommutableCMPCC:$cc),
+ def : Pat<(X86cmpm (_.BroadcastLdFrag addr:$src2),
+ (_.VT _.RC:$src1), timm:$cc),
(!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
- imm:$cc)>;
+ (X86cmpm_imm_commute timm:$cc))>;
- def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.BroadcastLdFrag addr:$src2),
(_.VT _.RC:$src1),
- CommutableCMPCC:$cc)),
+ timm:$cc)),
(!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
- imm:$cc)>;
+ (X86cmpm_imm_commute timm:$cc))>;
}
multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
@@ -2564,9 +2587,9 @@ multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1",
"$src1, $src2, {sae}, $cc",
- (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
(X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc)>,
+ timm:$cc)>,
EVEX_B, Sched<[sched]>;
}
@@ -2590,12 +2613,12 @@ defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>,
// Patterns to select fp compares with load as first operand.
let Predicates = [HasAVX512] in {
def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1,
- CommutableCMPCC:$cc)),
- (VCMPSDZrm FR64X:$src1, addr:$src2, imm:$cc)>;
+ timm:$cc)),
+ (VCMPSDZrm FR64X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>;
def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1,
- CommutableCMPCC:$cc)),
- (VCMPSSZrm FR32X:$src1, addr:$src2, imm:$cc)>;
+ timm:$cc)),
+ (VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>;
}
// ----------------------------------------------------------------
@@ -2621,7 +2644,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1),
- (i32 imm:$src2)))]>,
+ (i32 timm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
@@ -2629,7 +2652,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclasss_su (_.VT _.RC:$src1),
- (i32 imm:$src2))))]>,
+ (i32 timm:$src2))))]>,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.IntScalarMemOp:$src1, i32u8imm:$src2),
@@ -2637,7 +2660,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
(X86Vfpclasss _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2)))]>,
+ (i32 timm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
@@ -2645,7 +2668,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclasss_su _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2))))]>,
+ (i32 timm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -2661,7 +2684,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1),
- (i32 imm:$src2)))]>,
+ (i32 timm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
@@ -2669,7 +2692,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclass_su (_.VT _.RC:$src1),
- (i32 imm:$src2))))]>,
+ (i32 timm:$src2))))]>,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
@@ -2677,7 +2700,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclass
(_.VT (_.LdFrag addr:$src1)),
- (i32 imm:$src2)))]>,
+ (i32 timm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
@@ -2685,7 +2708,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (_.LdFrag addr:$src1)),
- (i32 imm:$src2))))]>,
+ (i32 timm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
@@ -2693,9 +2716,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
_.BroadcastStr##", $dst|$dst, ${src1}"
##_.BroadcastStr##", $src2}",
[(set _.KRC:$dst,(X86Vfpclass
- (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2)))]>,
+ (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2)))]>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
@@ -2703,9 +2725,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
_.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
_.BroadcastStr##", $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su
- (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2))))]>,
+ (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2))))]>,
EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -2836,13 +2857,21 @@ def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
(KMOVWrk VK16:$src)>;
+def : Pat<(i64 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (SUBREG_TO_REG (i64 0), (KMOVWrk VK16:$src), sub_32bit)>;
def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
(COPY_TO_REGCLASS VK16:$src, GR32)>;
+def : Pat<(i64 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK16:$src, GR32), sub_32bit)>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
(KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
+def : Pat<(i64 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (SUBREG_TO_REG (i64 0), (KMOVBrk VK8:$src), sub_32bit)>, Requires<[HasDQI]>;
def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
(COPY_TO_REGCLASS VK8:$src, GR32)>;
+def : Pat<(i64 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK8:$src, GR32), sub_32bit)>;
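// The new i64 patterns lean on the usual x86-64 rule that a write to a 32-bit
// GPR zeroes bits 63:32: for zext the KMOVWrk/KMOVBrk result can be wrapped
// in SUBREG_TO_REG with a zero immediate, while for anyext the upper half is
// undefined and inserting the 32-bit copy into an IMPLICIT_DEF is enough.
// For example, (i64 (zext (i16 (bitconvert VK16:$k)))) now selects
// (SUBREG_TO_REG 0, (KMOVWrk VK16:$k), sub_32bit) with no extra zero-extend.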
def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
(COPY_TO_REGCLASS GR32:$src, VK32)>;
@@ -3075,7 +3104,7 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
!strconcat(OpcodeStr,
"\t{$imm, $src, $dst|$dst, $src, $imm}"),
- [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>,
+ [(set KRC:$dst, (OpNode KRC:$src, (i8 timm:$imm)))]>,
Sched<[sched]>;
}
@@ -3098,30 +3127,6 @@ defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShu
defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>;
// Patterns for comparing 128/256-bit integer vectors using a 512-bit instruction.
-multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
- string InstStr,
- X86VectorVTInfo Narrow,
- X86VectorVTInfo Wide> {
- def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr#"Zrr")
- (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
- (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))),
- Narrow.KRC)>;
-
- def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (Frag_su (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2)))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr#"Zrrk")
- (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
- (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
- (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))),
- Narrow.KRC)>;
-}
-
-// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction.
multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
string InstStr,
X86VectorVTInfo Narrow,
@@ -3129,7 +3134,7 @@ multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
(Narrow.VT Narrow.RC:$src2), cond)),
(COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr##Zrri)
+ (!cast<Instruction>(InstStr#"Zrri")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
(Frag.OperandTransform $cc)), Narrow.KRC)>;
@@ -3138,53 +3143,111 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
(Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
(Narrow.VT Narrow.RC:$src2),
cond)))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik")
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
- (Frag.OperandTransform $cc)), Narrow.KRC)>;
+ (Frag_su.OperandTransform $cc)), Narrow.KRC)>;
+}
+
+multiclass axv512_icmp_packed_cc_rmb_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
+ PatFrag CommFrag, PatFrag CommFrag_su,
+ string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+// Broadcast load.
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.BroadcastLdFrag addr:$src2), cond)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmib")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (Frag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Narrow.KVT
+ (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.BroadcastLdFrag addr:$src2),
+ cond)))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (Frag_su.OperandTransform $cc)), Narrow.KRC)>;
+
+// Commuted with broadcast load.
+def : Pat<(Narrow.KVT (CommFrag:$cc (Narrow.BroadcastLdFrag addr:$src2),
+ (Narrow.VT Narrow.RC:$src1),
+ cond)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmib")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (CommFrag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Narrow.KVT
+ (CommFrag_su:$cc (Narrow.BroadcastLdFrag addr:$src2),
+ (Narrow.VT Narrow.RC:$src1),
+ cond)))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (CommFrag_su.OperandTransform $cc)), Narrow.KRC)>;
}
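// Taken together, these patterns select a 128/256-bit integer compare against
// a broadcast load without VLX by widening to the 512-bit form: the register
// operand is inserted into an undef zmm, the broadcast memory operand is
// folded into the "Zrmib"/"Zrmibk" instruction, and the wide k-register
// result is copied back to the narrow mask class. The commuted variants reuse
// the same instructions with the predicate rewritten through the fragment's
// OperandTransform.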
// Same as above, but for fp types which don't use PatFrags.
-multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, PatFrag OpNode_su,
- string InstStr,
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
-def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2), imm:$cc)),
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), timm:$cc)),
(COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr##Zrri)
+ (!cast<Instruction>(InstStr#"Zrri")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
- imm:$cc), Narrow.KRC)>;
+ timm:$cc), Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (OpNode_su (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2), imm:$cc))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+ (X86cmpm_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), timm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik")
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
- imm:$cc), Narrow.KRC)>;
-}
+ timm:$cc), Narrow.KRC)>;
-let Predicates = [HasAVX512, NoVLX] in {
- // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
- // increase the pattern complexity the way an immediate would.
- let AddedComplexity = 2 in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v8i32x_info, v16i32_info>;
+// Broadcast load.
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmbi")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, timm:$cc), Narrow.KRC)>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v4i32x_info, v16i32_info>;
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (X86cmpm_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, timm:$cc), Narrow.KRC)>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v4i64x_info, v8i64_info>;
+// Commuted with broadcast load.
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
+ (Narrow.VT Narrow.RC:$src1), timm:$cc)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmbi")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v2i64x_info, v8i64_info>;
- }
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (X86cmpm_su (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
+ (Narrow.VT Narrow.RC:$src1), timm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
+}
+let Predicates = [HasAVX512, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
@@ -3197,29 +3260,25 @@ let Predicates = [HasAVX512, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v8f32x_info, v16f32_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v4f32x_info, v16f32_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v4f64x_info, v8f64_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v2f64x_info, v8f64_info>;
-}
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v8i32x_info, v16i32_info>;
-let Predicates = [HasBWI, NoVLX] in {
- // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
- // increase the pattern complexity the way an immediate would.
- let AddedComplexity = 2 in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v8i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v8i16x_info, v32i16_info>;
- }
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v8f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v4f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v4f64x_info, v8f64_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>;
+}
+let Predicates = [HasBWI, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
@@ -4186,16 +4245,32 @@ def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)),
(COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),
(v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>;
+def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), (f32 FR32X:$src0))),
+ (COPY_TO_REGCLASS
+ (v4f32 (VMOVSSZrmk (v4f32 (COPY_TO_REGCLASS FR32X:$src0, VR128X)),
+ VK1WM:$mask, addr:$src)),
+ FR32X)>;
+def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), fp32imm0)),
+ (COPY_TO_REGCLASS (v4f32 (VMOVSSZrmkz VK1WM:$mask, addr:$src)), FR32X)>;
+
def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
(COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk
(v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)),
VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
(v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
-def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fpimm0)),
+def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fp64imm0)),
(COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
(v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
+def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), (f64 FR64X:$src0))),
+ (COPY_TO_REGCLASS
+ (v2f64 (VMOVSDZrmk (v2f64 (COPY_TO_REGCLASS FR64X:$src0, VR128X)),
+ VK1WM:$mask, addr:$src)),
+ FR64X)>;
+def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), fp64imm0)),
+ (COPY_TO_REGCLASS (v2f64 (VMOVSDZrmkz VK1WM:$mask, addr:$src)), FR64X)>;
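// These additions fold a masked scalar load straight into VMOVSS/VMOVSD with
// a {k} or {k}{z} mask: an X86selects whose true operand is a plain
// loadf32/loadf64 selects VMOVSSZrmk/VMOVSDZrmk when a pass-through register
// is given, and the zero-masking rmkz forms when the false operand is the
// fp32imm0/fp64imm0 constant, rather than emitting a separate load followed
// by a masked move.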
+
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
@@ -4537,8 +4612,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))>,
+ (_.BroadcastLdFrag addr:$src2)))>,
AVX512BIBase, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4664,8 +4738,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
"${src2}"##_Brdct.BroadcastStr##", $src1",
"$src1, ${src2}"##_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
- (_Brdct.VT (X86VBroadcast
- (_Brdct.ScalarLdFrag addr:$src2))))))>,
+ (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>,
AVX512BIBase, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4737,8 +4810,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"${src2}"##_Src.BroadcastStr##", $src1",
"$src1, ${src2}"##_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
- (_Src.VT (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src2))))))>,
+ (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>,
EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4874,22 +4946,11 @@ let Predicates = [HasDQI, NoVLX] in {
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
sub_ymm)>;
-
- def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
- (EXTRACT_SUBREG
- (VPMULLQZrr
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
- sub_xmm)>;
-}
-
-// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasDQI, NoVLX] in {
- def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))),
(EXTRACT_SUBREG
- (VPMULLQZrr
+ (VPMULLQZrmb
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ addr:$src2),
sub_ymm)>;
def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
@@ -4898,29 +4959,47 @@ let Predicates = [HasDQI, NoVLX] in {
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
sub_xmm)>;
+ def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrmb
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ addr:$src2),
+ sub_xmm)>;
}
-multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> {
+multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)),
(EXTRACT_SUBREG
- (Instr
+ (!cast<Instruction>(Instr#"rr")
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
sub_ymm)>;
+ def : Pat<(v4i64 (OpNode (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(Instr#"rmb")
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ addr:$src2),
+ sub_ymm)>;
def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)),
(EXTRACT_SUBREG
- (Instr
+ (!cast<Instruction>(Instr#"rr")
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
sub_xmm)>;
+ def : Pat<(v2i64 (OpNode (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(Instr#"rmb")
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ addr:$src2),
+ sub_xmm)>;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm : avx512_min_max_lowering<VPMAXUQZrr, umax>;
- defm : avx512_min_max_lowering<VPMINUQZrr, umin>;
- defm : avx512_min_max_lowering<VPMAXSQZrr, smax>;
- defm : avx512_min_max_lowering<VPMINSQZrr, smin>;
+ defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
+ defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
+ defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
+ defm : avx512_min_max_lowering<"VPMINSQZ", smin>;
}
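// With the multiclass taking the instruction base name as a string, the same
// lowering now covers the broadcast-load form as well: for example
//   (v2i64 (umax VR128X:$x, (X86VBroadcastld64 addr:$p)))
// widens $x into a zmm, selects VPMAXUQZrmb with the broadcast folded, and
// extracts sub_xmm, where previously only the register-register VPMAXUQZrr
// case was handled.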
//===----------------------------------------------------------------------===//
@@ -4977,32 +5056,6 @@ let Predicates = [HasVLX] in {
def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)),
(VPANDNQZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(and VR128X:$src1,
- (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDDZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(or VR128X:$src1,
- (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPORDZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(xor VR128X:$src1,
- (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPXORDZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR128X:$src1,
- (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDNDZ128rmb VR128X:$src1, addr:$src2)>;
-
- def : Pat<(and VR128X:$src1,
- (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDQZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(or VR128X:$src1,
- (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPORQZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(xor VR128X:$src1,
- (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPXORQZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR128X:$src1,
- (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDNQZ128rmb VR128X:$src1, addr:$src2)>;
-
def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)),
(VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)),
@@ -5042,32 +5095,6 @@ let Predicates = [HasVLX] in {
(VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)),
(VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
-
- def : Pat<(and VR256X:$src1,
- (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDDZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(or VR256X:$src1,
- (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPORDZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(xor VR256X:$src1,
- (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPXORDZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR256X:$src1,
- (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDNDZ256rmb VR256X:$src1, addr:$src2)>;
-
- def : Pat<(and VR256X:$src1,
- (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDQZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(or VR256X:$src1,
- (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPORQZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(xor VR256X:$src1,
- (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPXORQZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR256X:$src1,
- (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDNQZ256rmb VR256X:$src1, addr:$src2)>;
}
let Predicates = [HasAVX512] in {
@@ -5110,32 +5137,6 @@ let Predicates = [HasAVX512] in {
(VPANDNQZrm VR512:$src1, addr:$src2)>;
def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)),
(VPANDNQZrm VR512:$src1, addr:$src2)>;
-
- def : Pat<(and VR512:$src1,
- (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDDZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(or VR512:$src1,
- (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPORDZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(xor VR512:$src1,
- (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPXORDZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR512:$src1,
- (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDNDZrmb VR512:$src1, addr:$src2)>;
-
- def : Pat<(and VR512:$src1,
- (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDQZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(or VR512:$src1,
- (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPORQZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(xor VR512:$src1,
- (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPXORQZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR512:$src1,
- (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDNQZrmb VR512:$src1, addr:$src2)>;
}
// Patterns to catch vselect with different type than logic op.
@@ -5174,25 +5175,17 @@ multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode,
X86VectorVTInfo _,
X86VectorVTInfo IntInfo> {
// Register-broadcast logical operations.
- def : Pat<(IntInfo.VT (OpNode _.RC:$src1,
- (bitconvert (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))),
- (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert
(IntInfo.VT (OpNode _.RC:$src1,
- (bitconvert (_.VT
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))))),
+ (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert
(IntInfo.VT (OpNode _.RC:$src1,
- (bitconvert (_.VT
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))))),
+ (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
@@ -5329,7 +5322,8 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, SDNode SaeNode,
- X86FoldableSchedWrite sched, bit IsCommutable> {
+ X86FoldableSchedWrite sched, bit IsCommutable,
+ string EVEX2VexOvrd> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -5349,7 +5343,8 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
- Sched<[sched]> {
+ Sched<[sched]>,
+ EVEX2VEXOverride<EVEX2VexOvrd#"rr"> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
@@ -5357,7 +5352,8 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>,
+ EVEX2VEXOverride<EVEX2VexOvrd#"rm">;
}
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5387,10 +5383,12 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode VecNode, SDNode SaeNode,
X86SchedWriteSizes sched, bit IsCommutable> {
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
- VecNode, SaeNode, sched.PS.Scl, IsCommutable>,
+ VecNode, SaeNode, sched.PS.Scl, IsCommutable,
+ NAME#"SS">,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
- VecNode, SaeNode, sched.PD.Scl, IsCommutable>,
+ VecNode, SaeNode, sched.PD.Scl, IsCommutable,
+ NAME#"SD">,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
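// The extra EVEX2VexOvrd string is presumably there for the EVEX-to-VEX
// compression tables: the codegen-only FRC "rr"/"rm" defs get an explicit
// EVEX2VEXOverride naming the intended VEX scalar instruction (NAME#"SS"
// yields e.g. "VMAXSSrr"/"VMAXSSrm"), instead of relying on the automatic
// name-based pairing, so the shorter VEX encoding can still be used when no
// EVEX-only feature is required.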
defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds,
@@ -5410,13 +5408,14 @@ defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,
// X86fminc and X86fmaxc instead of X86fmin and X86fmax
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, SDNode OpNode,
- X86FoldableSchedWrite sched> {
+ X86FoldableSchedWrite sched,
+ string EVEX2VEXOvrd> {
let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
- Sched<[sched]> {
+ Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr"> {
let isCommutable = 1;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
@@ -5424,24 +5423,27 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
}
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
- SchedWriteFCmp.Scl>, XS, EVEX_4V,
- VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ SchedWriteFCmp.Scl, "VMINCSS">, XS,
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
- SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V,
- VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ SchedWriteFCmp.Scl, "VMINCSD">, XD,
+ VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
- SchedWriteFCmp.Scl>, XS, EVEX_4V,
- VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ SchedWriteFCmp.Scl, "VMAXCSS">, XS,
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
- SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V,
- VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ SchedWriteFCmp.Scl, "VMAXCSD">, XD,
+ VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
@@ -5464,8 +5466,7 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))>,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5595,8 +5596,7 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))>,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5751,13 +5751,13 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
(ins _.RC:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (i8 imm:$src2)))>,
+ (_.VT (OpNode _.RC:$src1, (i8 timm:$src2)))>,
Sched<[sched]>;
defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT (_.LdFrag addr:$src1)),
- (i8 imm:$src2)))>,
+ (i8 timm:$src2)))>,
Sched<[sched.Folded]>;
}
}
@@ -5769,7 +5769,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
- (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2)))>,
+ (_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>,
EVEX_B, Sched<[sched.Folded]>;
}
@@ -5911,17 +5911,17 @@ let Predicates = [HasAVX512, NoVLX] in {
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
VR128X:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
- def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
+ timm:$src2)), sub_xmm)>;
}
//===-------------------------------------------------------------------===//
@@ -5953,8 +5953,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))>,
+ (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>,
AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6062,27 +6061,27 @@ let Predicates = [HasAVX512, NoVLX] in {
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
sub_ymm)>;
- def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
- def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
- def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
}
// Use 512-bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
@@ -6113,27 +6112,27 @@ let Predicates = [HasAVX512, NoVLX] in {
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
sub_ymm)>;
- def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
- def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
- def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
}
//===-------------------------------------------------------------------===//
@@ -6228,8 +6227,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode
_.RC:$src1,
- (Ctrl.VT (X86VBroadcast
- (Ctrl.ScalarLdFrag addr:$src2)))))>,
+ (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>,
T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6419,7 +6417,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
- _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
+ _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>,
AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6493,7 +6491,7 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src2,
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6571,7 +6569,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr,
- (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2)), 1, 0>,
AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6964,7 +6962,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
- (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1)>,
AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -7504,14 +7502,13 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
OpcodeStr,
"${src}"##Broadcast, "${src}"##Broadcast,
(_.VT (OpNode (_Src.VT
- (X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
+ (_Src.BroadcastLdFrag addr:$src))
)),
(vselect MaskRC:$mask,
(_.VT
(OpNode
(_Src.VT
- (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src))))),
+ (_Src.BroadcastLdFrag addr:$src)))),
_.RC:$src0),
vselect, "$src0 = $dst">,
EVEX, EVEX_B, Sched<[sched.Folded]>;
@@ -7646,14 +7643,14 @@ let Predicates = [HasAVX512] in {
v8f32x_info.ImmAllZerosV),
(VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>;
- def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcastld64 addr:$src)))),
(VCVTPD2PSZrmb addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
- (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))),
+ (fpround (v8f64 (X86VBroadcastld64 addr:$src))),
(v8f32 VR256X:$src0)),
(VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
- (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))),
+ (fpround (v8f64 (X86VBroadcastld64 addr:$src))),
v8f32x_info.ImmAllZerosV),
(VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>;
}
@@ -7677,14 +7674,14 @@ let Predicates = [HasVLX] in {
v4f32x_info.ImmAllZerosV),
(VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>;
- def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))),
(VCVTPD2PSZ256rmb addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
- (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))),
VR128X:$src0),
(VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
- (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))),
v4f32x_info.ImmAllZerosV),
(VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>;
@@ -7708,12 +7705,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(X86vfpround (v2f64 (X86VBroadcast (loadf64 addr:$src)))),
+ def : Pat<(X86vfpround (v2f64 (X86VBroadcastld64 addr:$src))),
(VCVTPD2PSZ128rmb addr:$src)>;
- def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)),
(v4f32 VR128X:$src0), VK2WM:$mask),
(VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)),
v4f32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
}
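The hunks around here replace the two-node form (X86VBroadcast (loadf64 addr:$src)) with the single broadcast-load node X86VBroadcastld64 (or the class-level _.BroadcastLdFrag). A minimal scalar sketch of what a 64-bit broadcast-from-memory denotes; names here are illustrative, not from this patch:

#include <array>
#include <cstring>

// Illustrative model only (not part of the patch): a 64-bit broadcast load
// reads one scalar from memory and replicates it into every 64-bit lane,
// which is the operation the rmb/rmbk/rmbkz instruction forms fold.
static std::array<double, 8> broadcastLd64(const void *Addr) {
  double Scalar;
  std::memcpy(&Scalar, Addr, sizeof(Scalar)); // the single memory access
  std::array<double, 8> Vec;
  Vec.fill(Scalar);                           // splat into all lanes
  return Vec;
}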
@@ -8194,12 +8191,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTPD2DQZ128rmb addr:$src)>;
- def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
(VCVTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>;
@@ -8223,12 +8220,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTTPD2DQZ128rmb addr:$src)>;
- def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
(VCVTTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>;
@@ -8252,12 +8249,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTPD2UDQZ128rmb addr:$src)>;
- def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
(VCVTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
@@ -8281,12 +8278,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTTPD2UDQZ128rmb addr:$src)>;
- def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
(VCVTTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
}
@@ -8419,12 +8416,12 @@ let Predicates = [HasDQI, HasVLX] in {
VK2WM:$mask),
(VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))),
+ def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
(VCVTQQ2PSZ128rmb addr:$src)>;
- def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
(v4f32 VR128X:$src0), VK2WM:$mask),
(VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
v4f32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
@@ -8448,12 +8445,12 @@ let Predicates = [HasDQI, HasVLX] in {
VK2WM:$mask),
(VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))),
+ def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
(VCVTUQQ2PSZ128rmb addr:$src)>;
- def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
(v4f32 VR128X:$src0), VK2WM:$mask),
(VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
v4f32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
}
@@ -8576,21 +8573,21 @@ let ExeDomain = GenericDomain in {
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _dest.RC:$dst,
- (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)))]>,
+ (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
Sched<[RR]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
(ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _dest.RC:$dst,
- (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2),
+ (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2),
_dest.RC:$src0, _src.KRCWM:$mask))]>,
Sched<[RR]>, EVEX_K;
def rrkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
(ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}",
[(set _dest.RC:$dst,
- (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2),
+ (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2),
_dest.ImmAllZerosV, _src.KRCWM:$mask))]>,
Sched<[RR]>, EVEX_KZ;
let hasSideEffects = 0, mayStore = 1 in {
@@ -8631,17 +8628,17 @@ let Predicates = [HasAVX512] in {
}
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
- (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
def : Pat<(store (i64 (extractelt
- (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
- (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
- def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst),
- (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>;
- def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst),
- (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>;
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>;
+ def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>;
}
// Patterns for matching conversions from float to half-float and vice versa.
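A short sketch of the layout fact behind the VCVTPS2PH store patterns just above; the helper name is illustrative and toHalf stands in for the real float-to-half conversion:

#include <array>
#include <cstdint>
#include <cstring>

// Sketch (not part of the patch): cvtps2ph on an XMM register turns 4
// floats into 4 half-precision values occupying only the low 64 bits of
// the 128-bit result.  Storing "element 0 of the result viewed as v2i64
// (or v2f64)" therefore writes exactly those 8 bytes, which is what the
// memory form of the instruction does directly.
static uint64_t low64AfterCvtps2ph(const std::array<float, 4> &Src,
                                   uint16_t (*toHalf)(float)) {
  std::array<uint16_t, 4> Halves;
  for (unsigned I = 0; I < 4; ++I)
    Halves[I] = toHalf(Src[I]);
  uint64_t Low;
  std::memcpy(&Low, Halves.data(), sizeof(Low)); // 4 x 16 bits == 64 bits
  return Low;
}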
@@ -8765,7 +8762,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.VT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ (_.BroadcastLdFrag addr:$src)))>,
EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -8859,7 +8856,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.VT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ (_.BroadcastLdFrag addr:$src)))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -8940,7 +8937,7 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(fsqrt (_.VT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ (_.BroadcastLdFrag addr:$src)))>,
EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -9049,14 +9046,14 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3)))>,
+ (i32 timm:$src3)))>,
Sched<[sched]>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
(_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3)))>, EVEX_B,
+ (i32 timm:$src3)))>, EVEX_B,
Sched<[sched]>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -9064,7 +9061,7 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales _.RC:$src1,
- _.ScalarIntMemCPat:$src2, (i32 imm:$src3)))>,
+ _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
@@ -9082,15 +9079,15 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
}
let Predicates = [HasAVX512] in {
- def : Pat<(X86VRndScale _.FRC:$src1, imm:$src2),
+ def : Pat<(X86VRndScale _.FRC:$src1, timm:$src2),
(_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src1, imm:$src2))>;
+ _.FRC:$src1, timm:$src2))>;
}
let Predicates = [HasAVX512, OptForSize] in {
- def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), imm:$src2),
+ def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2),
(_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src1, imm:$src2))>;
+ addr:$src1, timm:$src2))>;
}
}
@@ -10109,19 +10106,19 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))>, Sched<[sched]>;
+ (i32 timm:$src2))>, Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2))>,
+ (i32 timm:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr##", $src2",
- (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2))>, EVEX_B,
+ (OpNode (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10136,7 +10133,7 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))>,
+ (i32 timm:$src2))>,
EVEX_B, Sched<[sched]>;
}
@@ -10169,22 +10166,22 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3))>, EVEX_B,
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ (i32 timm:$src3))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10200,7 +10197,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT SrcInfo.RC:$src2),
- (i8 imm:$src3)))>,
+ (i8 timm:$src3)))>,
Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
@@ -10208,7 +10205,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT (bitconvert
(SrcInfo.LdFrag addr:$src2))),
- (i8 imm:$src3)))>,
+ (i8 timm:$src3)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10226,8 +10223,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3))>, EVEX_B,
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ (i8 timm:$src3))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -10241,15 +10238,14 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
Sched<[sched]>;
defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3))>,
+ (_.VT _.ScalarIntMemCPat:$src2),
+ (i32 timm:$src3))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10265,7 +10261,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
EVEX_B, Sched<[sched]>;
}
@@ -10279,7 +10275,7 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
EVEX_B, Sched<[sched]>;
}
@@ -10401,7 +10397,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (bitconvert
(CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2,
- (i8 imm:$src3)))))>,
+ (i8 timm:$src3)))))>,
Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
@@ -10410,7 +10406,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
(bitconvert
(CastInfo.VT (X86Shuf128 _.RC:$src1,
(CastInfo.LdFrag addr:$src2),
- (i8 imm:$src3)))))>,
+ (i8 timm:$src3)))))>,
Sched<[sched.Folded, sched.ReadAfterFold]>,
EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -10421,8 +10417,8 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
(bitconvert
(CastInfo.VT
(X86Shuf128 _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- (i8 imm:$src3)))))>, EVEX_B,
+ (_.BroadcastLdFrag addr:$src2),
+ (i8 timm:$src3)))))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10491,14 +10487,14 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr,
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
- (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$src3)))>,
+ (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>,
Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86VAlign _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)),
- (i8 imm:$src3)))>,
+ (i8 timm:$src3)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>,
EVEX2VEXOverride<"VPALIGNRrmi">;
@@ -10507,8 +10503,8 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr,
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(X86VAlign _.RC:$src1,
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3))>, EVEX_B,
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ (i8 timm:$src3))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10541,13 +10537,13 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr",
// Fragments to help convert valignq into masked valignd. Or valignq/valignd
// into vpalignr.
-def ValignqImm32XForm : SDNodeXForm<imm, [{
+def ValignqImm32XForm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
}]>;
-def ValignqImm8XForm : SDNodeXForm<imm, [{
+def ValignqImm8XForm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
}]>;
-def ValigndImm8XForm : SDNodeXForm<imm, [{
+def ValigndImm8XForm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
}]>;
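These XForms now take timm, matching the timm operands used in the rewritten valign patterns; the scaling factors fall out of the element sizes. A small runnable sketch of the arithmetic (function name is illustrative):

#include <cassert>
#include <cstdint>

// Sketch (not part of the patch): valign rotates by a count measured in
// elements, so rewriting valignq (8-byte elements) as valignd (4-byte)
// doubles the count, and rewriting either as vpalignr (byte granularity)
// multiplies it by the element size in bytes.
static uint8_t rescaleAlignImm(uint8_t Imm, unsigned FromEltBytes,
                               unsigned ToEltBytes) {
  return uint8_t(Imm * FromEltBytes / ToEltBytes);
}

int main() {
  assert(rescaleAlignImm(3, 8, 4) == 6);  // ValignqImm32XForm: * 2
  assert(rescaleAlignImm(3, 8, 1) == 24); // ValignqImm8XForm:  * 8
  assert(rescaleAlignImm(3, 4, 1) == 12); // ValigndImm8XForm:  * 4
  return 0;
}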
@@ -10557,40 +10553,40 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1, From.RC:$src2,
- imm:$src3))),
+ timm:$src3))),
To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1, From.RC:$src2,
- imm:$src3))),
+ timm:$src3))),
To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(From.LdFrag addr:$src2),
- imm:$src3))),
+ timm:$src3))),
To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(From.LdFrag addr:$src2),
- imm:$src3))),
+ timm:$src3))),
To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
}
multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
@@ -10599,35 +10595,32 @@ multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
SDNodeXForm ImmXForm> :
avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
def : Pat<(From.VT (OpNode From.RC:$src1,
- (bitconvert (To.VT (X86VBroadcast
- (To.ScalarLdFrag addr:$src2)))),
- imm:$src3)),
+ (bitconvert (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3)),
(!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(bitconvert
- (To.VT (X86VBroadcast
- (To.ScalarLdFrag addr:$src2)))),
- imm:$src3))),
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(bitconvert
- (To.VT (X86VBroadcast
- (To.ScalarLdFrag addr:$src2)))),
- imm:$src3))),
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
}
let Predicates = [HasAVX512] in {
@@ -10666,13 +10659,13 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase,
+ (_.VT (OpNode (_.VT _.RC:$src1)))>, EVEX, AVX5128IBase,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
+ (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1)))))>,
EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded]>;
}
@@ -10685,8 +10678,7 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.ScalarMemOp:$src1), OpcodeStr,
"${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr,
- (_.VT (OpNode (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))))>,
+ (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded]>;
}
@@ -10770,7 +10762,7 @@ let Predicates = [HasAVX512, NoVLX] in {
multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
AVX512VLVectorVTInfo _, Predicate prd> {
let Predicates = [prd, NoVLX] in {
- def : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
+ def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))),
(EXTRACT_SUBREG
(!cast<Instruction>(InstrStr # "Zrr")
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
@@ -10778,7 +10770,7 @@ multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
_.info256.SubRegIdx)),
_.info256.SubRegIdx)>;
- def : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
+ def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1))),
(EXTRACT_SUBREG
(!cast<Instruction>(InstrStr # "Zrr")
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
@@ -10829,17 +10821,16 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup,
// AVX-512 - MOVDDUP
//===----------------------------------------------------------------------===//
-multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX,
+ (_.VT (X86VBroadcast (_.VT _.RC:$src)))>, EVEX,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src)))))>,
+ (_.VT (_.BroadcastLdFrag addr:$src))>,
EVEX, EVEX_CD8<_.EltSize, CD8VH>,
Sched<[sched.Folded]>;
}
@@ -10853,7 +10844,7 @@ multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX512, HasVLX] in {
defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.YMM,
VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, sched.XMM,
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, sched.XMM,
VTInfo.info128>, EVEX_V128;
}
}
@@ -10867,11 +10858,9 @@ multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>;
let Predicates = [HasVLX] in {
-def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
- (VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
+def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
@@ -10884,17 +10873,17 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
immAllZerosV),
(VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)),
immAllZerosV),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
immAllZerosV),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
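A scalar sketch of why the 128-bit MOVDDUP forms can be expressed with the broadcast fragments above (names are illustrative):

#include <array>

// Sketch (not part of the patch): for a 2 x double vector, movddup
// (duplicate the low element) and a 64-bit broadcast of that same value
// produce identical results, so the Z128 variant is written in terms of
// X86VBroadcast / BroadcastLdFrag.
static std::array<double, 2> movddup128(std::array<double, 2> V) {
  return {V[0], V[0]}; // == broadcast of V[0]
}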
@@ -11070,14 +11059,14 @@ multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
def rr : AVX512<opc, MRMr,
(outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>,
+ [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 timm:$src2))))]>,
Sched<[sched]>;
def rm : AVX512<opc, MRMm,
(outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i8 imm:$src2))))]>,
+ (i8 timm:$src2))))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -11104,6 +11093,7 @@ defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
string OpcodeStr, X86FoldableSchedWrite sched,
X86VectorVTInfo _dst, X86VectorVTInfo _src> {
+ let isCommutable = 1 in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
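A scalar model of one 64-bit lane of psadbw, showing why the isCommutable flag added above is safe (helper name is illustrative):

#include <array>
#include <cstdint>
#include <cstdlib>

// Sketch (not part of the patch): the sum of absolute byte differences is
// symmetric in its two inputs, so the register form may commute them.
static uint16_t psadbwLane(const std::array<uint8_t, 8> &A,
                           const std::array<uint8_t, 8> &B) {
  uint16_t Sum = 0;
  for (unsigned I = 0; I < 8; ++I)
    Sum += uint16_t(std::abs(int(A[I]) - int(B[I]))); // |a-b| == |b-a|
  return Sum;
}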
@@ -11140,7 +11130,7 @@ defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
// Transforms to swizzle an immediate to enable better matching when
// memory operand isn't in the right place.
-def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG321_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by swapping operand 0 and operand 2.
uint8_t Imm = N->getZExtValue();
// Swap bits 1/4 and 3/6.
@@ -11151,7 +11141,7 @@ def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{
if (Imm & 0x40) NewImm |= 0x08;
return getI8Imm(NewImm, SDLoc(N));
}]>;
-def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG213_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by swapping operand 1 and operand 2.
uint8_t Imm = N->getZExtValue();
// Swap bits 2/4 and 3/5.
@@ -11162,7 +11152,7 @@ def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{
if (Imm & 0x20) NewImm |= 0x08;
return getI8Imm(NewImm, SDLoc(N));
}]>;
-def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG132_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by swapping operand 1 and operand 2.
uint8_t Imm = N->getZExtValue();
// Swap bits 1/2 and 5/6.
@@ -11173,7 +11163,7 @@ def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{
if (Imm & 0x40) NewImm |= 0x20;
return getI8Imm(NewImm, SDLoc(N));
}]>;
-def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG231_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by moving operand 1 to the end.
uint8_t Imm = N->getZExtValue();
// Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5
@@ -11186,7 +11176,7 @@ def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{
if (Imm & 0x40) NewImm |= 0x20;
return getI8Imm(NewImm, SDLoc(N));
}]>;
-def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG312_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by moving operand 2 to the beginning.
uint8_t Imm = N->getZExtValue();
// Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3
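These transforms also switch from SDNodeXForm<imm> to SDNodeXForm<timm>, matching the timm:$src4 operands used in the patterns that follow. A runnable sketch of where the hard-coded bit swaps come from, reading the immediate as an 8-entry truth table (function names and the worked value are illustrative):

#include <cassert>
#include <cstdint>

// Sketch (not part of the patch): the result bit for operand bits
// (a, b, c) is bit ((a << 2) | (b << 1) | c) of the immediate.
static int ternlogBit(uint8_t Imm, int a, int b, int c) {
  return (Imm >> ((a << 2) | (b << 1) | c)) & 1;
}

// Rebuild the table for the same function with operands 0 and 2 swapped;
// enumerating the eight input combinations reproduces the hard-coded
// "swap bits 1/4 and 3/6" rule of VPTERNLOG321_imm8 above.
static uint8_t ternlogSwap02(uint8_t Imm) {
  uint8_t New = 0;
  for (int a = 0; a <= 1; ++a)
    for (int b = 0; b <= 1; ++b)
      for (int c = 0; c <= 1; ++c)
        if (ternlogBit(Imm, c, b, a)) // same table, first/third bits swapped
          New |= uint8_t(1 << ((a << 2) | (b << 1) | c));
  return New;
}

int main() {
  // 0xCA encodes "a ? b : c"; with operands 0 and 2 swapped the function
  // becomes "c ? b : a", whose table is 0xD8.
  assert(ternlogSwap02(0xCA) == 0xD8);
  return 0;
}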
@@ -11210,7 +11200,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
- (i8 imm:$src4)), 1, 1>,
+ (i8 timm:$src4)), 1, 1>,
AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
@@ -11218,7 +11208,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (bitconvert (_.LdFrag addr:$src3))),
- (i8 imm:$src4)), 1, 0>,
+ (i8 timm:$src4)), 1, 0>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -11227,146 +11217,145 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src2, ${src3}"##_.BroadcastStr##", $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- (i8 imm:$src4)), 1, 0>, EVEX_B,
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
+ (i8 timm:$src4)), 1, 0>, EVEX_B,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}// Constraints = "$src1 = $dst"
// Additional patterns for matching passthru operand in other positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)),
+ (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
// Additional patterns for matching loads in other positions.
def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4))),
(!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4))),
+ _.RC:$src2, (i8 timm:$src4))),
(!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
// Additional patterns for matching zero masking with loads in other
// positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src2, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
// Additional patterns for matching masked loads with different
// operand orders.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
- (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)),
+ (bitconvert (_.LdFrag addr:$src3)), (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>;
// Additional patterns for matching broadcasts in other positions.
- def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+ def : Pat<(_.VT (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4))),
(!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4))),
+ (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, (i8 timm:$src4))),
(!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
// Additional patterns for matching zero masking with broadcasts in other
// positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
- (VPTERNLOG321_imm8 imm:$src4))>;
+ (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4)),
+ (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
- (VPTERNLOG132_imm8 imm:$src4))>;
+ (VPTERNLOG132_imm8 timm:$src4))>;
// Additional patterns for matching masked broadcasts with different
// operand orders.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4)),
+ (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- (i8 imm:$src4)), _.RC:$src1)),
+ (_.BroadcastLdFrag addr:$src3),
+ (i8 timm:$src4)), _.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src1, (i8 imm:$src4)),
+ (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+ (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>;
}
multiclass avx512_common_ternlog<string OpcodeStr, X86SchedWriteWidths sched,
@@ -11387,6 +11376,113 @@ defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU,
defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
avx512vl_i64_info>, VEX_W;
+// Patterns to use VPTERNLOG for vXi16/vXi8 vectors.
+let Predicates = [HasVLX] in {
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (loadv16i8 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i8 (X86vpternlog (loadv16i8 addr:$src3), VR128X:$src2,
+ VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, (loadv16i8 addr:$src3),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (loadv8i16 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i16 (X86vpternlog (loadv8i16 addr:$src3), VR128X:$src2,
+ VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, (loadv8i16 addr:$src3),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (loadv32i8 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i8 (X86vpternlog (loadv32i8 addr:$src3), VR256X:$src2,
+ VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, (loadv32i8 addr:$src3),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (loadv16i16 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i16 (X86vpternlog (loadv16i16 addr:$src3), VR256X:$src2,
+ VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, (loadv16i16 addr:$src3),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
+ timm:$src4)>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
+ (loadv64i8 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v64i8 (X86vpternlog (loadv64i8 addr:$src3), VR512:$src2,
+ VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, (loadv64i8 addr:$src3),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
+ (loadv32i16 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog (loadv32i16 addr:$src3), VR512:$src2,
+ VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+}
+
// Patterns to implement vnot using vpternlog instead of creating all ones
// using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen
// so that the result is only dependent on src0. But we use the same source
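A tiny check of the claim in the comment above, treating the immediate as a truth table indexed by ((a << 2) | (b << 1) | c); illustrative code, not part of the patch:

#include <cassert>
#include <cstdint>

// With immediate 0x0f only the table entries with a == 0 are set, so the
// result is ~a regardless of the other two operands -- tying all three
// sources to the same register therefore gives a vector NOT.
static int ternlog(uint8_t Imm, int a, int b, int c) {
  return (Imm >> ((a << 2) | (b << 1) | c)) & 1;
}

int main() {
  for (int a = 0; a <= 1; ++a)
    for (int b = 0; b <= 1; ++b)
      for (int c = 0; c <= 1; ++c)
        assert(ternlog(0x0f, a, b, c) == (a ^ 1));
  return 0;
}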
@@ -11498,14 +11594,14 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT _.RC:$src3),
- (i32 imm:$src4))>, Sched<[sched]>;
+ (i32 timm:$src4))>, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
- (i32 imm:$src4))>,
+ (i32 timm:$src4))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
@@ -11513,8 +11609,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
"$src2, ${src3}"##_.BroadcastStr##", $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))),
- (i32 imm:$src4))>,
+ (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)),
+ (i32 timm:$src4))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // Constraints = "$src1 = $dst"
}
@@ -11531,7 +11627,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
(X86VFixupimmSAE (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT _.RC:$src3),
- (i32 imm:$src4))>,
+ (i32 timm:$src4))>,
EVEX_B, Sched<[sched]>;
}
}
@@ -11547,7 +11643,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
(X86VFixupimms (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
- (i32 imm:$src4))>, Sched<[sched]>;
+ (i32 timm:$src4))>, Sched<[sched]>;
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
@@ -11555,7 +11651,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
(X86VFixupimmSAEs (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
- (i32 imm:$src4))>,
+ (i32 timm:$src4))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
@@ -11564,13 +11660,13 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
(_.VT _.RC:$src2),
(_src3VT.VT (scalar_to_vector
(_src3VT.ScalarLdFrag addr:$src3))),
- (i32 imm:$src4))>,
+ (i32 timm:$src4))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched,
- AVX512VLVectorVTInfo _Vec,
+ AVX512VLVectorVTInfo _Vec,
AVX512VLVectorVTInfo _Tbl> {
let Predicates = [HasAVX512] in
defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM,
@@ -11804,7 +11900,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"${src3}"##VTI.BroadcastStr##", $src2",
"$src2, ${src3}"##VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3))))>,
+ (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
AVX512FMA3Base, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -11880,12 +11976,14 @@ defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256,
let Constraints = "$src1 = $dst" in
multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
- X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI,
+ bit IsCommutable> {
defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1,
- VTI.RC:$src2, VTI.RC:$src3))>,
+ VTI.RC:$src2, VTI.RC:$src3)),
+ IsCommutable, IsCommutable>,
EVEX_4V, T8PD, Sched<[sched]>;
defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
@@ -11899,27 +11997,58 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
OpStr, "${src3}"##VTI.BroadcastStr##", $src2",
"$src2, ${src3}"##VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (X86VBroadcast
- (VTI.ScalarLdFrag addr:$src3))))>,
+ (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
- X86SchedWriteWidths sched> {
+ X86SchedWriteWidths sched, bit IsCommutable> {
let Predicates = [HasVNNI] in
- defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info>, EVEX_V512;
+ defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info,
+ IsCommutable>, EVEX_V512;
let Predicates = [HasVNNI, HasVLX] in {
- defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info>, EVEX_V256;
- defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info>, EVEX_V128;
+ defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info,
+ IsCommutable>, EVEX_V128;
}
}
// FIXME: Is there a better scheduler class for VPDP?
-defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul>;
-defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul>;
-defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul>;
-defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul>;
+defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0>;
+defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0>;
+defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>;
+defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>;
+
+def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86vpmaddwd node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+
+// Patterns to match VPDPWSSD from existing instructions/intrinsics.
+let Predicates = [HasVNNI] in {
+ def : Pat<(v16i32 (add VR512:$src1,
+ (X86vpmaddwd_su VR512:$src2, VR512:$src3))),
+ (VPDPWSSDZr VR512:$src1, VR512:$src2, VR512:$src3)>;
+ def : Pat<(v16i32 (add VR512:$src1,
+ (X86vpmaddwd_su VR512:$src2, (load addr:$src3)))),
+ (VPDPWSSDZm VR512:$src1, VR512:$src2, addr:$src3)>;
+}
+let Predicates = [HasVNNI,HasVLX] in {
+ def : Pat<(v8i32 (add VR256X:$src1,
+ (X86vpmaddwd_su VR256X:$src2, VR256X:$src3))),
+ (VPDPWSSDZ256r VR256X:$src1, VR256X:$src2, VR256X:$src3)>;
+ def : Pat<(v8i32 (add VR256X:$src1,
+ (X86vpmaddwd_su VR256X:$src2, (load addr:$src3)))),
+ (VPDPWSSDZ256m VR256X:$src1, VR256X:$src2, addr:$src3)>;
+ def : Pat<(v4i32 (add VR128X:$src1,
+ (X86vpmaddwd_su VR128X:$src2, VR128X:$src3))),
+ (VPDPWSSDZ128r VR128X:$src1, VR128X:$src2, VR128X:$src3)>;
+ def : Pat<(v4i32 (add VR128X:$src1,
+ (X86vpmaddwd_su VR128X:$src2, (load addr:$src3)))),
+ (VPDPWSSDZ128m VR128X:$src1, VR128X:$src2, addr:$src3)>;
+}
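A scalar model of one 32-bit lane, showing why the add + X86vpmaddwd_su combination above is equivalent to VPDPWSSD (and, incidentally, why the word-by-word forms are marked commutable while the byte forms, unsigned times signed, are not). The saturating variants are ignored and the helper name is illustrative:

#include <array>
#include <cstdint>

// Sketch (not part of the patch): vpdpwssd multiplies the two signed
// 16-bit halves pairwise, adds the pair, and accumulates -- exactly
// add(acc, pmaddwd(a, b)).  The hasOneUse predicate keeps the fold from
// firing when the pmaddwd result is also needed on its own.
static int32_t vpdpwssdLane(int32_t Acc, std::array<int16_t, 2> A,
                            std::array<int16_t, 2> B) {
  return Acc + int32_t(A[0]) * B[0] + int32_t(A[1]) * B[1];
}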
//===----------------------------------------------------------------------===//
// Bit Algorithms
@@ -12004,8 +12133,8 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1",
"$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3",
(OpNode (VTI.VT VTI.RC:$src1),
- (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))),
- (i8 imm:$src3))>, EVEX_B,
+ (bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))),
+ (i8 timm:$src3))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -12116,7 +12245,7 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> {
!strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr,
", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"),
[(set _.KRPC:$dst, (X86vp2intersect
- _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>,
+ _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>,
EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
}
@@ -12217,12 +12346,12 @@ let Predicates = [HasBF16, HasVLX] in {
(VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32
- (X86VBroadcast (loadf32 addr:$src))))),
+ (X86VBroadcastld32 addr:$src)))),
(VCVTNEPS2BF16Z128rmb addr:$src)>;
- def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)),
(v8i16 VR128X:$src0), VK4WM:$mask),
(VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
- def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)),
v8i16x_info.ImmAllZerosV, VK4WM:$mask),
(VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>;
}
@@ -12249,7 +12378,7 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr),
(_.VT (OpNode _.RC:$src1, _.RC:$src2,
- (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>,
+ (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>,
EVEX_B, EVEX_4V;
}
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index e52635f8d48b..1e399a894490 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1271,22 +1271,22 @@ let isCompare = 1 in {
// ANDN Instruction
//
multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
- PatFrag ld_frag> {
+ PatFrag ld_frag, X86FoldableSchedWrite sched> {
def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
- Sched<[WriteALU]>;
+ Sched<[sched]>;
def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS,
(X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
- Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
}
// Complexity is reduced to give and with immediate a chance to match first.
let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in {
- defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V;
- defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W;
+ defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS, VEX_4V;
+ defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, VEX_4V, VEX_W;
}
let Predicates = [HasBMI], AddedComplexity = -6 in {
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index 50aed98112c3..aa45e9b191c1 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -131,11 +131,11 @@ addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
/// reference.
static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand,
unsigned Reg) {
- // Direct memory address is in a form of: Reg, 1 (Scale), NoReg, 0, NoReg.
- MI->getOperand(Operand).setReg(Reg);
+ // Direct memory address is in a form of: Reg/FI, 1 (Scale), NoReg, 0, NoReg.
+ MI->getOperand(Operand).ChangeToRegister(Reg, /*isDef=*/false);
MI->getOperand(Operand + 1).setImm(1);
MI->getOperand(Operand + 2).setReg(0);
- MI->getOperand(Operand + 3).setImm(0);
+ MI->getOperand(Operand + 3).ChangeToImmediate(0);
MI->getOperand(Operand + 4).setReg(0);
}
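A plausible reading of this change, given the updated "Reg/FI" comment: the operands being overwritten are no longer guaranteed to already be a plain register or immediate (the base may currently be a frame index, the displacement a symbolic operand), and ChangeToRegister/ChangeToImmediate rewrite the operand kind in place, whereas setReg/setImm require the operand to already be of that kind. For reference, the five operands of an X86 memory reference in the order the helper touches them; enumerator names below are illustrative, not LLVM's:

// Sketch only (not part of the patch): layout of an X86 memory reference.
enum X86AddrOperandSketch {
  AddrBase = 0, // register or frame index
  AddrScale,    // immediate, forced to 1
  AddrIndex,    // register, cleared to NoRegister
  AddrDisp,     // immediate (or symbol), forced to 0
  AddrSegment   // register, cleared to NoRegister
};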
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index 099f6aa8d8bb..330b8c7a8a43 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -20,19 +20,19 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
: I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond),
"cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst,
- (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>,
+ (X86cmov GR16:$src1, GR16:$src2, timm:$cond, EFLAGS))]>,
TB, OpSize16;
def CMOV32rr
: I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond),
"cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst,
- (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>,
+ (X86cmov GR32:$src1, GR32:$src2, timm:$cond, EFLAGS))]>,
TB, OpSize32;
def CMOV64rr
:RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond),
"cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst,
- (X86cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>, TB;
+ (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB;
}
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
@@ -41,29 +41,46 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
: I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond),
"cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
- imm:$cond, EFLAGS))]>, TB, OpSize16;
+ timm:$cond, EFLAGS))]>, TB, OpSize16;
def CMOV32rm
: I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond),
"cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
- imm:$cond, EFLAGS))]>, TB, OpSize32;
+ timm:$cond, EFLAGS))]>, TB, OpSize32;
def CMOV64rm
:RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond),
"cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
- imm:$cond, EFLAGS))]>, TB;
+ timm:$cond, EFLAGS))]>, TB;
} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
} // isCodeGenOnly = 1, ForceDisassemble = 1
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+ X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue());
+ return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC),
+ SDLoc(N), MVT::i8);
+}]>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+let Predicates = [HasCMov] in {
+ def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
+ (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+ def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
+ (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
+ (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+}
+
// SetCC instructions.
let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in {
def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond),
"set${cond}\t$dst",
- [(set GR8:$dst, (X86setcc imm:$cond, EFLAGS))]>,
+ [(set GR8:$dst, (X86setcc timm:$cond, EFLAGS))]>,
TB, Sched<[WriteSETCC]>;
def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond),
"set${cond}\t$dst",
- [(store (X86setcc imm:$cond, EFLAGS), addr:$dst)]>,
+ [(store (X86setcc timm:$cond, EFLAGS), addr:$dst)]>,
TB, Sched<[WriteSETCCStore]>;
} // Uses = [EFLAGS]
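
The swapped-load patterns above exist because CMOVxxrm can only fold a load into the operand that is selected when the condition holds; when the load feeds the other operand, the pattern swaps the operands and inverts the condition so the computed value is unchanged. A minimal sketch of the inversion step (the helper is the one inv_cond_XFORM calls; the surrounding pseudo-expressions are illustrative only):

  //   dst = cond ? reg : [mem]    // load picked on the "condition false" path:
  //                               // not directly foldable into cmovCC dst, [mem]
  //   dst = !cond ? [mem] : reg   // same value, but the load is now picked when
  //                               // the (inverted) condition holds -> CMOVxxrm
  X86::CondCode CC = X86::COND_E;                           // e.g. cmove
  X86::CondCode Inv = X86::GetOppositeBranchCondition(CC);  // X86::COND_NE
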
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index efaccdc9ee96..78d8dd3c0d03 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -542,7 +542,7 @@ multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
def CMOV#NAME : I<0, Pseudo,
(outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond),
"#CMOV_"#NAME#" PSEUDO!",
- [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond,
+ [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, timm:$cond,
EFLAGS)))]>;
}
@@ -593,66 +593,66 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _VK64 : CMOVrr_PSEUDO<VK64, v64i1>;
} // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS]
-def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
+def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
let Predicates = [NoVLX] in {
- def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
- def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
- def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
- def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
- def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
-
- def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
- def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
- def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
- def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
- def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
+ def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+
+ def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
}
let Predicates = [HasVLX] in {
- def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
- def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
- def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
- def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
- def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
-
- def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
- def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
- def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
- def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
- def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
+ def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+
+ def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
}
-def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
-def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
-def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
-def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
-def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
+def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
@@ -1126,12 +1126,12 @@ def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
// binary size compared to a regular MOV, but it introduces an unnecessary
// load, so is not suitable for regular or optsize functions.
let Predicates = [OptForMinSize] in {
-def : Pat<(nonvolatile_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
-def : Pat<(nonvolatile_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
-def : Pat<(nonvolatile_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
-def : Pat<(nonvolatile_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
-def : Pat<(nonvolatile_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
-def : Pat<(nonvolatile_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
+def : Pat<(simple_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
+def : Pat<(simple_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
+def : Pat<(simple_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
+def : Pat<(simple_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
+def : Pat<(simple_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
+def : Pat<(simple_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
}
// In kernel code model, we can get the address of a label
@@ -1276,23 +1276,6 @@ def : Pat<(X86cmp GR32:$src1, 0),
def : Pat<(X86cmp GR64:$src1, 0),
(TEST64rr GR64:$src1, GR64:$src1)>;
-def inv_cond_XFORM : SDNodeXForm<imm, [{
- X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue());
- return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC),
- SDLoc(N), MVT::i8);
-}]>;
-
-// Conditional moves with folded loads with operands swapped and conditions
-// inverted.
-let Predicates = [HasCMov] in {
- def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, imm:$cond, EFLAGS),
- (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
- def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, imm:$cond, EFLAGS),
- (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
- def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, imm:$cond, EFLAGS),
- (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
-}
-
// zextload bool -> zextload byte
// i1 stored in one byte in zero-extended form.
// Upper bits cleanup should be executed before Store.
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index f82e80965b7c..e1e6eea59884 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -75,7 +75,7 @@ let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump],
def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs),
(ins brtarget8:$dst, ccode:$cond),
"j${cond}\t$dst",
- [(X86brcond bb:$dst, imm:$cond, EFLAGS)]>;
+ [(X86brcond bb:$dst, timm:$cond, EFLAGS)]>;
let hasSideEffects = 0 in {
def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs),
(ins brtarget16:$dst, ccode:$cond),
@@ -145,6 +145,17 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
[(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>,
Sched<[WriteJumpLd]>;
+ // Win64 wants indirect jumps leaving the function to have a REX_W prefix.
+ // These are switched from TAILJMPr/m64_REX in MCInstLower.
+ let isCodeGenOnly = 1, hasREX_WPrefix = 1 in {
+ def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst),
+ "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>;
+ let mayLoad = 1 in
+ def JMP64m_REX : I<0xFF, MRM4m, (outs), (ins i64mem:$dst),
+ "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJumpLd]>;
+
+ }
+
// Non-tracking jumps for IBT, use with caution.
let isCodeGenOnly = 1 in {
def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst",
@@ -273,39 +284,35 @@ let isCall = 1 in
// Tail call stuff.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
- let Uses = [ESP, SSP] in {
- def TCRETURNdi : PseudoI<(outs),
- (ins i32imm_pcrel:$dst, i32imm:$offset), []>, NotMemoryFoldable;
- def TCRETURNri : PseudoI<(outs),
- (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable;
+ isCodeGenOnly = 1, Uses = [ESP, SSP] in {
+ def TCRETURNdi : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>, NotMemoryFoldable;
+ def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>, NotMemoryFoldable;
let mayLoad = 1 in
- def TCRETURNmi : PseudoI<(outs),
- (ins i32mem_TC:$dst, i32imm:$offset), []>;
+ def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset),
+ []>, Sched<[WriteJumpLd]>;
- // FIXME: The should be pseudo instructions that are lowered when going to
- // mcinst.
- def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
- (ins i32imm_pcrel:$dst), "jmp\t$dst", []>;
+ def TAILJMPd : PseudoI<(outs), (ins i32imm_pcrel:$dst),
+ []>, Sched<[WriteJump]>;
- def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "", []>; // FIXME: Remove encoding when JIT is dead.
+ def TAILJMPr : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
+ []>, Sched<[WriteJump]>;
let mayLoad = 1 in
- def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
- "jmp{l}\t{*}$dst", []>;
+ def TAILJMPm : PseudoI<(outs), (ins i32mem_TC:$dst),
+ []>, Sched<[WriteJumpLd]>;
}
// Conditional tail calls are similar to the above, but they are branches
// rather than barriers, and they use EFLAGS.
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [ESP, EFLAGS, SSP] in {
def TCRETURNdicc : PseudoI<(outs),
(ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
// This gets substituted to a conditional jump instruction in MC lowering.
- def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i32imm_pcrel:$dst, i32imm:$cond), "", []>;
+ def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$cond), []>;
}
@@ -348,34 +355,36 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- isCodeGenOnly = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
+ isCodeGenOnly = 1, Uses = [RSP, SSP] in {
def TCRETURNdi64 : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset),
- []>;
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>;
def TCRETURNri64 : PseudoI<(outs),
- (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable;
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>, NotMemoryFoldable;
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
- (ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable;
+ (ins i64mem_TC:$dst, i32imm:$offset),
+ []>, Sched<[WriteJumpLd]>, NotMemoryFoldable;
- def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
- "jmp\t$dst", []>;
+ def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst),
+ []>, Sched<[WriteJump]>;
- def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "jmp{q}\t{*}$dst", []>;
+ def TAILJMPr64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
+ []>, Sched<[WriteJump]>;
let mayLoad = 1 in
- def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
- "jmp{q}\t{*}$dst", []>;
+ def TAILJMPm64 : PseudoI<(outs), (ins i64mem_TC:$dst),
+ []>, Sched<[WriteJumpLd]>;
// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
let hasREX_WPrefix = 1 in {
- def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "rex64 jmp{q}\t{*}$dst", []>;
+ def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
+ []>, Sched<[WriteJump]>;
let mayLoad = 1 in
- def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
- "rex64 jmp{q}\t{*}$dst", []>;
+ def TAILJMPm64_REX : PseudoI<(outs), (ins i64mem_TC:$dst),
+ []>, Sched<[WriteJumpLd]>;
}
}
@@ -403,13 +412,13 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
// Conditional tail calls are similar to the above, but they are branches
// rather than barriers, and they use EFLAGS.
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [RSP, EFLAGS, SSP] in {
def TCRETURNdi64cc : PseudoI<(outs),
(ins i64i32imm_pcrel:$dst, i32imm:$offset,
i32imm:$cond), []>;
// This gets substituted to a conditional jump instruction in MC lowering.
- def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$cond), "", []>;
+ def TAILJMPd64_CC : PseudoI<(outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$cond), []>;
}
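
These TAILJMP*/TAILJMP*_CC definitions can become plain pseudos above because, as the in-line comments note, the real jump is substituted during MC lowering. A shape-only sketch of that direction (the exact mapping lives in X86MCInstLower.cpp and may differ in detail; JMP64r_REX/JMP64m_REX are the new Win64 REX.W forms added earlier in this patch):

  static unsigned convertTailJumpToRealJump(unsigned PseudoOpc) {
    switch (PseudoOpc) {
    case X86::TAILJMPr64:     return X86::JMP64r;
    case X86::TAILJMPm64:     return X86::JMP64m;
    case X86::TAILJMPr64_REX: return X86::JMP64r_REX; // keep the REX.W prefix
    case X86::TAILJMPm64_REX: return X86::JMP64m_REX;
    default:
      llvm_unreachable("not a tail-jump pseudo handled in this sketch");
    }
  }
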
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 06e605fe5db2..7a4eb138ec34 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -17,19 +17,18 @@ let hasSideEffects = 0 in {
let Defs = [EAX], Uses = [AX] in // EAX = signext(AX)
def CWDE : I<0x98, RawFrm, (outs), (ins),
"{cwtl|cwde}", []>, OpSize32, Sched<[WriteALU]>;
+ let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
+ def CDQE : RI<0x98, RawFrm, (outs), (ins),
+ "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
+ // FIXME: CWD/CDQ/CQO shouldn't Def the A register, but the fast register
+ // allocator crashes if you remove it.
let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX)
def CWD : I<0x99, RawFrm, (outs), (ins),
"{cwtd|cwd}", []>, OpSize16, Sched<[WriteALU]>;
let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX)
def CDQ : I<0x99, RawFrm, (outs), (ins),
"{cltd|cdq}", []>, OpSize32, Sched<[WriteALU]>;
-
-
- let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
- def CDQE : RI<0x98, RawFrm, (outs), (ins),
- "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
-
let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX)
def CQO : RI<0x99, RawFrm, (outs), (ins),
"{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp
index d42fec3770c7..f3b286e0375c 100644
--- a/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/lib/Target/X86/X86InstrFoldTables.cpp
@@ -292,6 +292,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD },
{ X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
{ X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD },
+ { X86::MMX_MOVD64from64rr, X86::MMX_MOVD64from64rm, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
{ X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
{ X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
@@ -5245,6 +5247,270 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
};
+static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = {
+ { X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD },
+ { X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD },
+ { X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD },
+ { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS },
+ { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS },
+ { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS },
+ { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD },
+ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD },
+ { X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD },
+ { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS },
+ { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS },
+ { X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD },
+ { X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rmb, TB_BCAST_SS },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rmb, TB_BCAST_SS },
+ { X86::VDIVPSZrr, X86::VDIVPSZrmb, TB_BCAST_SS },
+ { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rmb, TB_BCAST_SD },
+ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rmb, TB_BCAST_SD },
+ { X86::VMAXCPDZrr, X86::VMAXCPDZrmb, TB_BCAST_SD },
+ { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS },
+ { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS },
+ { X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS },
+ { X86::VMAXPDZ128rr, X86::VMAXPDZ128rmb, TB_BCAST_SD },
+ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rmb, TB_BCAST_SD },
+ { X86::VMAXPDZrr, X86::VMAXPDZrmb, TB_BCAST_SD },
+ { X86::VMAXPSZ128rr, X86::VMAXPSZ128rmb, TB_BCAST_SS },
+ { X86::VMAXPSZ256rr, X86::VMAXPSZ256rmb, TB_BCAST_SS },
+ { X86::VMAXPSZrr, X86::VMAXPSZrmb, TB_BCAST_SS },
+ { X86::VMINCPDZ128rr, X86::VMINCPDZ128rmb, TB_BCAST_SD },
+ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rmb, TB_BCAST_SD },
+ { X86::VMINCPDZrr, X86::VMINCPDZrmb, TB_BCAST_SD },
+ { X86::VMINCPSZ128rr, X86::VMINCPSZ128rmb, TB_BCAST_SS },
+ { X86::VMINCPSZ256rr, X86::VMINCPSZ256rmb, TB_BCAST_SS },
+ { X86::VMINCPSZrr, X86::VMINCPSZrmb, TB_BCAST_SS },
+ { X86::VMINPDZ128rr, X86::VMINPDZ128rmb, TB_BCAST_SD },
+ { X86::VMINPDZ256rr, X86::VMINPDZ256rmb, TB_BCAST_SD },
+ { X86::VMINPDZrr, X86::VMINPDZrmb, TB_BCAST_SD },
+ { X86::VMINPSZ128rr, X86::VMINPSZ128rmb, TB_BCAST_SS },
+ { X86::VMINPSZ256rr, X86::VMINPSZ256rmb, TB_BCAST_SS },
+ { X86::VMINPSZrr, X86::VMINPSZrmb, TB_BCAST_SS },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD },
+ { X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rmb, TB_BCAST_SS },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rmb, TB_BCAST_SS },
+ { X86::VMULPSZrr, X86::VMULPSZrmb, TB_BCAST_SS },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rmb, TB_BCAST_D },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rmb, TB_BCAST_D },
+ { X86::VPADDDZrr, X86::VPADDDZrmb, TB_BCAST_D },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rmb, TB_BCAST_Q },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rmb, TB_BCAST_Q },
+ { X86::VPADDQZrr, X86::VPADDQZrmb, TB_BCAST_Q },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rmb, TB_BCAST_D },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rmb, TB_BCAST_D },
+ { X86::VPANDDZrr, X86::VPANDDZrmb, TB_BCAST_D },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D },
+ { X86::VPANDNDZrr, X86::VPANDNDZrmb, TB_BCAST_D },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q },
+ { X86::VPANDNQZrr, X86::VPANDNQZrmb, TB_BCAST_Q },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rmb, TB_BCAST_Q },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rmb, TB_BCAST_Q },
+ { X86::VPANDQZrr, X86::VPANDQZrmb, TB_BCAST_Q },
+ { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmib, TB_BCAST_D },
+ { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmib, TB_BCAST_D },
+ { X86::VPCMPDZrri, X86::VPCMPDZrmib, TB_BCAST_D },
+ { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rmb, TB_BCAST_D },
+ { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rmb, TB_BCAST_D },
+ { X86::VPCMPEQDZrr, X86::VPCMPEQDZrmb, TB_BCAST_D },
+ { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rmb, TB_BCAST_Q },
+ { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rmb, TB_BCAST_Q },
+ { X86::VPCMPEQQZrr, X86::VPCMPEQQZrmb, TB_BCAST_Q },
+ { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rmb, TB_BCAST_D },
+ { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rmb, TB_BCAST_D },
+ { X86::VPCMPGTDZrr, X86::VPCMPGTDZrmb, TB_BCAST_D },
+ { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rmb, TB_BCAST_Q },
+ { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rmb, TB_BCAST_Q },
+ { X86::VPCMPGTQZrr, X86::VPCMPGTQZrmb, TB_BCAST_Q },
+ { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmib, TB_BCAST_Q },
+ { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmib, TB_BCAST_Q },
+ { X86::VPCMPQZrri, X86::VPCMPQZrmib, TB_BCAST_Q },
+ { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmib, TB_BCAST_D },
+ { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmib, TB_BCAST_D },
+ { X86::VPCMPUDZrri, X86::VPCMPUDZrmib, TB_BCAST_D },
+ { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmib, TB_BCAST_Q },
+ { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmib, TB_BCAST_Q },
+ { X86::VPCMPUQZrri, X86::VPCMPUQZrmib, TB_BCAST_Q },
+ { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rmb, TB_BCAST_D },
+ { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rmb, TB_BCAST_D },
+ { X86::VPMAXSDZrr, X86::VPMAXSDZrmb, TB_BCAST_D },
+ { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rmb, TB_BCAST_Q },
+ { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rmb, TB_BCAST_Q },
+ { X86::VPMAXSQZrr, X86::VPMAXSQZrmb, TB_BCAST_Q },
+ { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rmb, TB_BCAST_D },
+ { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rmb, TB_BCAST_D },
+ { X86::VPMAXUDZrr, X86::VPMAXUDZrmb, TB_BCAST_D },
+ { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rmb, TB_BCAST_Q },
+ { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rmb, TB_BCAST_Q },
+ { X86::VPMAXUQZrr, X86::VPMAXUQZrmb, TB_BCAST_Q },
+ { X86::VPMINSDZ128rr, X86::VPMINSDZ128rmb, TB_BCAST_D },
+ { X86::VPMINSDZ256rr, X86::VPMINSDZ256rmb, TB_BCAST_D },
+ { X86::VPMINSDZrr, X86::VPMINSDZrmb, TB_BCAST_D },
+ { X86::VPMINSQZ128rr, X86::VPMINSQZ128rmb, TB_BCAST_Q },
+ { X86::VPMINSQZ256rr, X86::VPMINSQZ256rmb, TB_BCAST_Q },
+ { X86::VPMINSQZrr, X86::VPMINSQZrmb, TB_BCAST_Q },
+ { X86::VPMINUDZ128rr, X86::VPMINUDZ128rmb, TB_BCAST_D },
+ { X86::VPMINUDZ256rr, X86::VPMINUDZ256rmb, TB_BCAST_D },
+ { X86::VPMINUDZrr, X86::VPMINUDZrmb, TB_BCAST_D },
+ { X86::VPMINUQZ128rr, X86::VPMINUQZ128rmb, TB_BCAST_Q },
+ { X86::VPMINUQZ256rr, X86::VPMINUQZ256rmb, TB_BCAST_Q },
+ { X86::VPMINUQZrr, X86::VPMINUQZrmb, TB_BCAST_Q },
+ { X86::VPMULLDZ128rr, X86::VPMULLDZ128rmb, TB_BCAST_D },
+ { X86::VPMULLDZ256rr, X86::VPMULLDZ256rmb, TB_BCAST_D },
+ { X86::VPMULLDZrr, X86::VPMULLDZrmb, TB_BCAST_D },
+ { X86::VPMULLQZ128rr, X86::VPMULLQZ128rmb, TB_BCAST_Q },
+ { X86::VPMULLQZ256rr, X86::VPMULLQZ256rmb, TB_BCAST_Q },
+ { X86::VPMULLQZrr, X86::VPMULLQZrmb, TB_BCAST_Q },
+ { X86::VPORDZ128rr, X86::VPORDZ128rmb, TB_BCAST_D },
+ { X86::VPORDZ256rr, X86::VPORDZ256rmb, TB_BCAST_D },
+ { X86::VPORDZrr, X86::VPORDZrmb, TB_BCAST_D },
+ { X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q },
+ { X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q },
+ { X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q },
+ { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rmb, TB_BCAST_D },
+ { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rmb, TB_BCAST_D },
+ { X86::VPTESTMDZrr, X86::VPTESTMDZrmb, TB_BCAST_D },
+ { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rmb, TB_BCAST_Q },
+ { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rmb, TB_BCAST_Q },
+ { X86::VPTESTMQZrr, X86::VPTESTMQZrmb, TB_BCAST_Q },
+ { X86::VPTESTNMDZ128rr,X86::VPTESTNMDZ128rmb,TB_BCAST_D },
+ { X86::VPTESTNMDZ256rr,X86::VPTESTNMDZ256rmb,TB_BCAST_D },
+ { X86::VPTESTNMDZrr, X86::VPTESTNMDZrmb, TB_BCAST_D },
+ { X86::VPTESTNMQZ128rr,X86::VPTESTNMQZ128rmb,TB_BCAST_Q },
+ { X86::VPTESTNMQZ256rr,X86::VPTESTNMQZ256rmb,TB_BCAST_Q },
+ { X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D },
+ { X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rmb, TB_BCAST_Q },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rmb, TB_BCAST_Q },
+ { X86::VPXORQZrr, X86::VPXORQZrmb, TB_BCAST_Q },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD },
+ { X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rmb, TB_BCAST_SS },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rmb, TB_BCAST_SS },
+ { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS },
+};
+
+static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = {
+ { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADD132PDZr, X86::VFMADD132PDZmb, TB_BCAST_SD },
+ { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADD132PSZr, X86::VFMADD132PSZmb, TB_BCAST_SS },
+ { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADD213PDZr, X86::VFMADD213PDZmb, TB_BCAST_SD },
+ { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADD213PSZr, X86::VFMADD213PSZmb, TB_BCAST_SS },
+ { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADD231PDZr, X86::VFMADD231PDZmb, TB_BCAST_SD },
+ { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADD231PSZr, X86::VFMADD231PSZmb, TB_BCAST_SS },
+ { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZmb, TB_BCAST_SD },
+ { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZmb, TB_BCAST_SS },
+ { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZmb, TB_BCAST_SD },
+ { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZmb, TB_BCAST_SS },
+ { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZmb, TB_BCAST_SD },
+ { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZmb, TB_BCAST_SS },
+ { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUB132PDZr, X86::VFMSUB132PDZmb, TB_BCAST_SD },
+ { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUB132PSZr, X86::VFMSUB132PSZmb, TB_BCAST_SS },
+ { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUB213PDZr, X86::VFMSUB213PDZmb, TB_BCAST_SD },
+ { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUB213PSZr, X86::VFMSUB213PSZmb, TB_BCAST_SS },
+ { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUB231PDZr, X86::VFMSUB231PDZmb, TB_BCAST_SD },
+ { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUB231PSZr, X86::VFMSUB231PSZmb, TB_BCAST_SS },
+ { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZmb, TB_BCAST_SD },
+ { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZmb, TB_BCAST_SS },
+ { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZmb, TB_BCAST_SD },
+ { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZmb, TB_BCAST_SS },
+ { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZmb, TB_BCAST_SD },
+ { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZmb, TB_BCAST_SS },
+ { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMADD132PDZr, X86::VFNMADD132PDZmb, TB_BCAST_SD },
+ { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMADD132PSZr, X86::VFNMADD132PSZmb, TB_BCAST_SS },
+ { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMADD213PDZr, X86::VFNMADD213PDZmb, TB_BCAST_SD },
+ { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMADD213PSZr, X86::VFNMADD213PSZmb, TB_BCAST_SS },
+ { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMADD231PDZr, X86::VFNMADD231PDZmb, TB_BCAST_SD },
+ { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMADD231PSZr, X86::VFNMADD231PSZmb, TB_BCAST_SS },
+ { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZmb, TB_BCAST_SD },
+ { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZmb, TB_BCAST_SS },
+ { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZmb, TB_BCAST_SD },
+ { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZmb, TB_BCAST_SS },
+ { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZmb, TB_BCAST_SD },
+ { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS },
+};
+
static const X86MemoryFoldTableEntry *
lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
#ifndef NDEBUG
@@ -5287,6 +5553,18 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
std::end(MemoryFoldTable4)) ==
std::end(MemoryFoldTable4) &&
"MemoryFoldTable4 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(BroadcastFoldTable2),
+ std::end(BroadcastFoldTable2)) &&
+ std::adjacent_find(std::begin(BroadcastFoldTable2),
+ std::end(BroadcastFoldTable2)) ==
+ std::end(BroadcastFoldTable2) &&
+ "BroadcastFoldTable2 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(BroadcastFoldTable3),
+ std::end(BroadcastFoldTable3)) &&
+ std::adjacent_find(std::begin(BroadcastFoldTable3),
+ std::end(BroadcastFoldTable3)) ==
+ std::end(BroadcastFoldTable3) &&
+ "BroadcastFoldTable3 is not sorted and unique!");
FoldTablesChecked.store(true, std::memory_order_relaxed);
}
#endif
@@ -5355,6 +5633,15 @@ struct X86MemUnfoldTable {
// Index 4, folded load
addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD);
+ // Broadcast tables.
+ for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable2)
+ // Index 2, folded broadcast
+ addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
+
+ for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3)
+ // Index 2, folded broadcast
+ addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
+
// Sort the memory->reg unfold table.
array_pod_sort(Table.begin(), Table.end());
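
The new is_sorted/adjacent_find assertions and the broadcast tables above only make sense because fold-table lookups binary-search on the register-form opcode. A standalone sketch of that lookup discipline with a toy entry type mirroring X86MemoryFoldTableEntry (names and field widths are illustrative, not the real struct):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  struct FoldEntry {
    uint16_t KeyOp;  // register-form opcode (the sort key)
    uint16_t MemOp;  // memory/broadcast-form opcode
    uint16_t Flags;  // TB_* flags
    bool operator<(const FoldEntry &RHS) const { return KeyOp < RHS.KeyOp; }
  };

  // Returns the matching entry or nullptr; requires the table to be sorted
  // and to contain at most one entry per KeyOp.
  static const FoldEntry *lookup(const FoldEntry *Begin, const FoldEntry *End,
                                 uint16_t RegOp) {
    assert(std::is_sorted(Begin, End) && "fold table must stay sorted");
    const FoldEntry *I = std::lower_bound(Begin, End, FoldEntry{RegOp, 0, 0});
    return (I != End && I->KeyOp == RegOp) ? I : nullptr;
  }
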
diff --git a/lib/Target/X86/X86InstrFoldTables.h b/lib/Target/X86/X86InstrFoldTables.h
index 419baf98f61d..7dc236a0d7e4 100644
--- a/lib/Target/X86/X86InstrFoldTables.h
+++ b/lib/Target/X86/X86InstrFoldTables.h
@@ -19,35 +19,48 @@ namespace llvm {
enum {
// Select which memory operand is being unfolded.
- // (stored in bits 0 - 3)
+ // (stored in bits 0 - 2)
TB_INDEX_0 = 0,
TB_INDEX_1 = 1,
TB_INDEX_2 = 2,
TB_INDEX_3 = 3,
TB_INDEX_4 = 4,
- TB_INDEX_MASK = 0xf,
+ TB_INDEX_MASK = 0x7,
// Do not insert the reverse map (MemOp -> RegOp) into the table.
// This may be needed because there is a many -> one mapping.
- TB_NO_REVERSE = 1 << 4,
+ TB_NO_REVERSE = 1 << 3,
// Do not insert the forward map (RegOp -> MemOp) into the table.
// This is needed for Native Client, which prohibits branch
// instructions from using a memory operand.
- TB_NO_FORWARD = 1 << 5,
+ TB_NO_FORWARD = 1 << 4,
- TB_FOLDED_LOAD = 1 << 6,
- TB_FOLDED_STORE = 1 << 7,
+ TB_FOLDED_LOAD = 1 << 5,
+ TB_FOLDED_STORE = 1 << 6,
+ TB_FOLDED_BCAST = 1 << 7,
// Minimum alignment required for load/store.
- // Used for RegOp->MemOp conversion.
- // (stored in bits 8 - 15)
+ // Used for RegOp->MemOp conversion. Encoded as Log2(Align) + 1 to allow 0
+ // to mean align of 0.
+ // (stored in bits 8 - 11)
TB_ALIGN_SHIFT = 8,
- TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
- TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
- TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
- TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
- TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 5 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 6 << TB_ALIGN_SHIFT,
+ TB_ALIGN_64 = 7 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xf << TB_ALIGN_SHIFT,
+
+ // Broadcast type.
+ // (stored in bits 12 - 13)
+ TB_BCAST_TYPE_SHIFT = 12,
+ TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_MASK = 0x3 << TB_BCAST_TYPE_SHIFT,
+
+ // Unused bits 14-15
};
// This struct is used for both the folding and unfold tables. The KeyOp
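
A standalone sketch of decoding the repacked flag word, using the layout above (index in bits 0-2, alignment stored as Log2(Align) + 1 in bits 8-11, broadcast type in bits 12-13); the enum here just mirrors those constants for illustration:

  #include <cstdint>

  enum : uint16_t {
    TB_INDEX_MASK       = 0x7,
    TB_ALIGN_SHIFT      = 8,
    TB_ALIGN_MASK       = 0xf << TB_ALIGN_SHIFT,
    TB_BCAST_TYPE_SHIFT = 12,
    TB_BCAST_MASK       = 0x3 << TB_BCAST_TYPE_SHIFT,
  };

  static unsigned foldIndex(uint16_t Flags) { return Flags & TB_INDEX_MASK; }

  // 0 means "no alignment requirement"; otherwise the stored value is
  // Log2(Align) + 1, so TB_ALIGN_16 (5 << 8) decodes back to 16.
  static unsigned foldAlignment(uint16_t Flags) {
    unsigned Enc = (Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
    return Enc ? 1u << (Enc - 1) : 0;
  }

  static unsigned broadcastKind(uint16_t Flags) {
    return (Flags & TB_BCAST_MASK) >> TB_BCAST_TYPE_SHIFT; // 0=D 1=Q 2=SS 3=SD
  }
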
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 096cc27861ca..de6f8a81dff6 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -103,6 +103,8 @@ def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
@@ -954,6 +956,26 @@ def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr),
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
}]>;
+def X86VBroadcastld8 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 1;
+}]>;
+
+def X86VBroadcastld16 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 2;
+}]>;
+
+def X86VBroadcastld32 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
+}]>;
+
+def X86VBroadcastld64 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
+
def fp32imm0 : PatLeaf<(f32 fpimm), [{
return N->isExactlyValue(+0.0);
@@ -963,6 +985,10 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{
return N->isExactlyValue(+0.0);
}]>;
+def fp128imm0 : PatLeaf<(f128 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
// to VEXTRACTF128/VEXTRACTI128 imm.
def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index dbe45356c42b..c29029daeec9 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -30,7 +30,7 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -465,7 +465,7 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
/// Return true if the register is a PIC base, i.e., defined by X86::MOVPC32r.
static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
// Don't waste compile time scanning use-def chains of physregs.
- if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+ if (!Register::isVirtualRegister(BaseReg))
return false;
bool isPICBase = false;
for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
@@ -480,9 +480,50 @@ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
}
bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const {
+ AAResults *AA) const {
switch (MI.getOpcode()) {
- default: break;
+ default:
+ // This function should only be called for opcodes with the ReMaterializable
+ // flag set.
+ llvm_unreachable("Unknown rematerializable operation!");
+ break;
+
+ case X86::LOAD_STACK_GUARD:
+ case X86::AVX1_SETALLONES:
+ case X86::AVX2_SETALLONES:
+ case X86::AVX512_128_SET0:
+ case X86::AVX512_256_SET0:
+ case X86::AVX512_512_SET0:
+ case X86::AVX512_512_SETALLONES:
+ case X86::AVX512_FsFLD0SD:
+ case X86::AVX512_FsFLD0SS:
+ case X86::AVX512_FsFLD0F128:
+ case X86::AVX_SET0:
+ case X86::FsFLD0SD:
+ case X86::FsFLD0SS:
+ case X86::FsFLD0F128:
+ case X86::KSET0D:
+ case X86::KSET0Q:
+ case X86::KSET0W:
+ case X86::KSET1D:
+ case X86::KSET1Q:
+ case X86::KSET1W:
+ case X86::MMX_SET0:
+ case X86::MOV32ImmSExti8:
+ case X86::MOV32r0:
+ case X86::MOV32r1:
+ case X86::MOV32r_1:
+ case X86::MOV32ri64:
+ case X86::MOV64ImmSExti8:
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::MOV16ri:
+ case X86::MOV32ri:
+ case X86::MOV64ri:
+ case X86::MOV64ri32:
+ case X86::MOV8ri:
+ return true;
+
case X86::MOV8rm:
case X86::MOV8rm_NOREX:
case X86::MOV16rm:
@@ -561,7 +602,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
MI.isDereferenceableInvariantLoad(AA)) {
- unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+ Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
// Allow re-materialization of PIC load.
@@ -583,7 +624,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
// lea fi#, lea GV, etc. are all rematerializable.
if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
return true;
- unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+ Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0)
return true;
// Allow re-materialization of lea PICBase + x.
@@ -594,10 +635,6 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
return false;
}
}
-
- // All other instructions marked M_REMATERIALIZABLE are always trivially
- // rematerializable.
- return true;
}
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
@@ -664,7 +701,7 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
}
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
- unsigned Opc, bool AllowSP, unsigned &NewSrc,
+ unsigned Opc, bool AllowSP, Register &NewSrc,
bool &isKill, MachineOperand &ImplicitOp,
LiveVariables *LV) const {
MachineFunction &MF = *MI.getParent()->getParent();
@@ -675,7 +712,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
RC = Opc != X86::LEA32r ?
&X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
}
- unsigned SrcReg = Src.getReg();
+ Register SrcReg = Src.getReg();
// For both LEA64 and LEA32 the register already has essentially the right
// type (32-bit or 64-bit) we may just need to forbid SP.
@@ -684,7 +721,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
isKill = Src.isKill();
assert(!Src.isUndef() && "Undef op doesn't need optimization");
- if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
+ if (Register::isVirtualRegister(NewSrc) &&
!MF.getRegInfo().constrainRegClass(NewSrc, RC))
return false;
@@ -693,7 +730,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
// This is for an LEA64_32r and incoming registers are 32-bit. One way or
// another we need to add 64-bit registers to the final MI.
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ if (Register::isPhysicalRegister(SrcReg)) {
ImplicitOp = Src;
ImplicitOp.setImplicit();
@@ -740,8 +777,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
return nullptr;
unsigned Opcode = X86::LEA64_32r;
- unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
- unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
// Build and insert into an implicit UNDEF value. This is OK because
// we will be shifting and then extracting the lower 8/16-bits.
@@ -751,8 +788,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
// But testing has shown this *does* help performance in 64-bit mode (at
// least on modern x86 machines).
MachineBasicBlock::iterator MBBI = MI.getIterator();
- unsigned Dest = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ Register Dest = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
bool IsDead = MI.getOperand(0).isDead();
bool IsKill = MI.getOperand(1).isKill();
unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
@@ -794,7 +831,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
case X86::ADD8rr_DB:
case X86::ADD16rr:
case X86::ADD16rr_DB: {
- unsigned Src2 = MI.getOperand(2).getReg();
+ Register Src2 = MI.getOperand(2).getReg();
bool IsKill2 = MI.getOperand(2).isKill();
assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
unsigned InRegLEA2 = 0;
@@ -888,7 +925,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
// LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ if (Register::isVirtualRegister(Src.getReg()) &&
!MF.getRegInfo().constrainRegClass(Src.getReg(),
&X86::GR64_NOSPRegClass))
return nullptr;
@@ -911,7 +948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// LEA can't handle ESP.
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, ImplicitOp, LV))
@@ -947,7 +984,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
(Is64Bit ? X86::LEA64_32r : X86::LEA32r);
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
ImplicitOp, LV))
@@ -970,7 +1007,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
: (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
ImplicitOp, LV))
@@ -1005,7 +1042,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, ImplicitOp, LV))
@@ -1013,7 +1050,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
const MachineOperand &Src2 = MI.getOperand(2);
bool isKill2;
- unsigned SrcReg2;
+ Register SrcReg2;
MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
SrcReg2, isKill2, ImplicitOp2, LV))
@@ -1054,7 +1091,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, ImplicitOp, LV))
@@ -1085,6 +1122,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return nullptr;
case X86::SUB32ri8:
case X86::SUB32ri: {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
int64_t Imm = MI.getOperand(2).getImm();
if (!isInt<32>(-Imm))
return nullptr;
@@ -1093,7 +1132,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, ImplicitOp, LV))
@@ -1111,6 +1150,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::SUB64ri8:
case X86::SUB64ri32: {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
int64_t Imm = MI.getOperand(2).getImm();
if (!isInt<32>(-Imm))
return nullptr;
@@ -1140,40 +1181,62 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
- case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: {
+ case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
+ case X86::VBROADCASTSDZ256mk:
+ case X86::VBROADCASTSDZmk:
+ case X86::VBROADCASTSSZ128mk:
+ case X86::VBROADCASTSSZ256mk:
+ case X86::VBROADCASTSSZmk:
+ case X86::VPBROADCASTDZ128mk:
+ case X86::VPBROADCASTDZ256mk:
+ case X86::VPBROADCASTDZmk:
+ case X86::VPBROADCASTQZ128mk:
+ case X86::VPBROADCASTQZ256mk:
+ case X86::VPBROADCASTQZmk: {
unsigned Opc;
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
- case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
- case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
- case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
- case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
- case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
- case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
- case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
- case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
- case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
- case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
- case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
- case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
- case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
- case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
- case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
- case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
- case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
- case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
- case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
- case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
- case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
- case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
- case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
- case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
- case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
- case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
- case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
- case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
- case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
- case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
+ case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
+ case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
+ case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+ case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+ case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
+ case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VBROADCASTSDZ256mk: Opc = X86::VBLENDMPDZ256rmbk; break;
+ case X86::VBROADCASTSDZmk: Opc = X86::VBLENDMPDZrmbk; break;
+ case X86::VBROADCASTSSZ128mk: Opc = X86::VBLENDMPSZ128rmbk; break;
+ case X86::VBROADCASTSSZ256mk: Opc = X86::VBLENDMPSZ256rmbk; break;
+ case X86::VBROADCASTSSZmk: Opc = X86::VBLENDMPSZrmbk; break;
+ case X86::VPBROADCASTDZ128mk: Opc = X86::VPBLENDMDZ128rmbk; break;
+ case X86::VPBROADCASTDZ256mk: Opc = X86::VPBLENDMDZ256rmbk; break;
+ case X86::VPBROADCASTDZmk: Opc = X86::VPBLENDMDZrmbk; break;
+ case X86::VPBROADCASTQZ128mk: Opc = X86::VPBLENDMQZ128rmbk; break;
+ case X86::VPBROADCASTQZ256mk: Opc = X86::VPBLENDMQZ256rmbk; break;
+ case X86::VPBROADCASTQZmk: Opc = X86::VPBLENDMQZrmbk; break;
}
NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@@ -1187,6 +1250,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
.add(MI.getOperand(7));
break;
}
+
case X86::VMOVDQU8Z128rrk:
case X86::VMOVDQU8Z256rrk:
case X86::VMOVDQU8Zrrk:
@@ -1683,6 +1747,27 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri:
+ case X86::VCMPPDZrrik:
+ case X86::VCMPPSZrrik:
+ case X86::VCMPPDZ128rrik:
+ case X86::VCMPPSZ128rrik:
+ case X86::VCMPPDZ256rrik:
+ case X86::VCMPPSZ256rrik: {
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x1f;
+ Imm = X86::getSwappedVCMPImm(Imm);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
case X86::VPERM2F128rr:
case X86::VPERM2I128rr: {
// Flip permute source immediate.
@@ -1859,7 +1944,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
// CommutableOpIdx2 is well defined now. Let's choose another commutable
// operand and assign its index to CommutableOpIdx1.
- unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
+ Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
unsigned CommutableOpIdx1;
for (CommutableOpIdx1 = LastCommutableVecOp;
@@ -1889,7 +1974,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
return true;
}
-bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
const MCInstrDesc &Desc = MI.getDesc();
if (!Desc.isCommutable())
@@ -1926,17 +2012,23 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
switch (Imm) {
+ default:
+ // EVEX versions can be commuted.
+ if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
+ break;
+ return false;
case 0x00: // EQUAL
case 0x03: // UNORDERED
case 0x04: // NOT EQUAL
case 0x07: // ORDERED
- // The indices of the commutable operands are 1 and 2 (or 2 and 3
- // when masked).
- // Assign them to the returned operand indices here.
- return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
- 2 + OpOffset);
+ break;
}
- return false;
+
+ // The indices of the commutable operands are 1 and 2 (or 2 and 3
+ // when masked).
+ // Assign them to the returned operand indices here.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
+ 2 + OpOffset);
}
case X86::MOVSSrr:
// X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
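
For the non-EVEX CMPSS/CMPSD/CMPPS/CMPPD forms the immediate cannot be rewritten here, so only the symmetric predicates in the low three bits (EQ, UNORD, NEQ, ORD) are accepted; the EVEX forms take the default path and are allowed for any predicate, since commuteInstructionImpl can rewrite their immediate via getSwappedVCMPImm. A small standalone sketch of the symmetry test (illustrative only, not the LLVM code):

#include <cassert>

// Low-3-bit FCMP predicates: 0 EQ, 1 LT, 2 LE, 3 UNORD, 4 NEQ, 5 NLT, 6 NLE, 7 ORD.
// cmp(a,b) == cmp(b,a) holds exactly for EQ/UNORD/NEQ/ORD.
static bool isSymmetricFCmp(unsigned Imm) {
  switch (Imm & 0x7) {
  case 0x00: case 0x03: case 0x04: case 0x07:
    return true;
  default:
    return false;
  }
}

int main() {
  assert(isSymmetricFCmp(0x00) && isSymmetricFCmp(0x07));   // EQ, ORD
  assert(!isSymmetricFCmp(0x01) && !isSymmetricFCmp(0x06)); // LT, NLE
  return 0;
}
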
@@ -1990,6 +2082,24 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ case X86::VPDPWSSDZ128r:
+ case X86::VPDPWSSDZ128rk:
+ case X86::VPDPWSSDZ128rkz:
+ case X86::VPDPWSSDZ256r:
+ case X86::VPDPWSSDZ256rk:
+ case X86::VPDPWSSDZ256rkz:
+ case X86::VPDPWSSDZr:
+ case X86::VPDPWSSDZrk:
+ case X86::VPDPWSSDZrkz:
+ case X86::VPDPWSSDSZ128r:
+ case X86::VPDPWSSDSZ128rk:
+ case X86::VPDPWSSDSZ128rkz:
+ case X86::VPDPWSSDSZ256r:
+ case X86::VPDPWSSDSZ256rk:
+ case X86::VPDPWSSDSZ256rkz:
+ case X86::VPDPWSSDSZr:
+ case X86::VPDPWSSDSZrk:
+ case X86::VPDPWSSDSZrkz:
case X86::VPMADD52HUQZ128r:
case X86::VPMADD52HUQZ128rk:
case X86::VPMADD52HUQZ128rkz:
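
The VPDPWSSD/VPDPWSSDS cases are commutable because each 32-bit lane accumulates a sum of products of adjacent signed 16-bit elements, and the two multiplicand operands contribute symmetrically; only the accumulator operand is fixed. A scalar model of one lane (illustrative; the saturating variant clamps the same products, so the symmetry argument is unchanged):

#include <cassert>
#include <cstdint>

// One 32-bit lane of vpdpwssd: acc + a0*b0 + a1*b1 over signed 16-bit pairs.
static int32_t dpwssdLane(int32_t Acc, int16_t A0, int16_t A1,
                          int16_t B0, int16_t B1) {
  return Acc + int32_t(A0) * B0 + int32_t(A1) * B1;
}

int main() {
  // Swapping the two multiplicand vectors leaves every product unchanged.
  assert(dpwssdLane(5, 100, -3, 7, 40) == dpwssdLane(5, 7, 40, 100, -3));
  return 0;
}
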
@@ -2215,7 +2325,7 @@ unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
}
}
-/// Get the VPCMP immediate if the opcodes are swapped.
+/// Get the VPCMP immediate if the operands are swapped.
unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
switch (Imm) {
default: llvm_unreachable("Unreachable!");
@@ -2233,7 +2343,7 @@ unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
return Imm;
}
-/// Get the VPCOM immediate if the opcodes are swapped.
+/// Get the VPCOM immediate if the operands are swapped.
unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
switch (Imm) {
default: llvm_unreachable("Unreachable!");
@@ -2251,6 +2361,23 @@ unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
return Imm;
}
+/// Get the VCMP immediate if the operands are swapped.
+unsigned X86::getSwappedVCMPImm(unsigned Imm) {
+  // Only need the lower 2 bits to distinguish.
+ switch (Imm & 0x3) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x00: case 0x03:
+ // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
+ break;
+ case 0x01: case 0x02:
+ // Need to toggle bits 3:0. Bit 4 stays the same.
+ Imm ^= 0xf;
+ break;
+ }
+
+ return Imm;
+}
+
bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
if (!MI.isTerminator()) return false;
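
getSwappedVCMPImm relies on the layout of the 32 AVX-512 FCMP predicates: the relational ones (LT/LE/NLT/NLE and their GE/GT/NGE/NGT counterparts) have 01 or 10 in the low two bits, and swapping the comparison operands maps each of them to the immediate with bits 3:0 inverted, while the symmetric predicates (the EQ/NEQ/ORD/UNORD/TRUE/FALSE variants) are left alone. A standalone spot-check sketch (predicate names follow the usual VCMP encoding; this mirrors the rule, it is not the LLVM function):

#include <cassert>

// Relational predicates (low two bits 01 or 10) toggle bits 3:0 when the
// operands are swapped; everything else is symmetric and stays unchanged.
static unsigned swappedVCMPImm(unsigned Imm) {
  unsigned Low2 = Imm & 0x3;
  return (Low2 == 0x1 || Low2 == 0x2) ? (Imm ^ 0xf) : Imm;
}

int main() {
  assert(swappedVCMPImm(0x01) == 0x0e); // LT_OS  <-> GT_OS
  assert(swappedVCMPImm(0x02) == 0x0d); // LE_OS  <-> GE_OS
  assert(swappedVCMPImm(0x11) == 0x1e); // LT_OQ  <-> GT_OQ
  assert(swappedVCMPImm(0x00) == 0x00); // EQ_OQ is symmetric
  assert(swappedVCMPImm(0x13) == 0x13); // UNORD_S is symmetric
  return 0;
}
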
@@ -3131,25 +3258,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(isKill));
}
-void X86InstrInfo::storeRegToAddr(
- MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC,
- ArrayRef<MachineMemOperand *> MMOs,
- SmallVectorImpl<MachineInstr *> &NewMIs) const {
- const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
- unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
- DebugLoc DL;
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.add(Addr[i]);
- MIB.addReg(SrcReg, getKillRegState(isKill));
- MIB.setMemRefs(MMOs);
- NewMIs.push_back(MIB);
-}
-
-
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned DestReg, int FrameIdx,
@@ -3164,23 +3272,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx);
}
-void X86InstrInfo::loadRegFromAddr(
- MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC,
- ArrayRef<MachineMemOperand *> MMOs,
- SmallVectorImpl<MachineInstr *> &NewMIs) const {
- const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
- unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
- DebugLoc DL;
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.add(Addr[i]);
- MIB.setMemRefs(MMOs);
- NewMIs.push_back(MIB);
-}
-
bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const {
@@ -3599,8 +3690,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (!IsCmpZero && !Sub)
return false;
- bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
- Sub->getOperand(2).getReg() == SrcReg);
+ bool IsSwapped =
+ (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg);
// Scan forward from the instruction after CmpInstr for uses of EFLAGS.
// It is safe to remove CmpInstr if EFLAGS is redefined or killed.
@@ -3755,7 +3847,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg != FoldAsLoadDefReg)
continue;
// Do not fold if we have a subreg use or a def.
@@ -3785,7 +3877,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
const MCInstrDesc &Desc) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
MIB->setDesc(Desc);
// MachineInstr::addOperand() will insert explicit operands before any
@@ -3815,7 +3907,7 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
bool MinusOne) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
// Insert the XOR.
BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
@@ -3891,7 +3983,7 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
const TargetInstrInfo &TII) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
auto Flags = MachineMemOperand::MOLoad |
@@ -3929,7 +4021,7 @@ static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
const MCInstrDesc &LoadDesc,
const MCInstrDesc &BroadcastDesc,
unsigned SubIdx) {
- unsigned DestReg = MIB->getOperand(0).getReg();
+ Register DestReg = MIB->getOperand(0).getReg();
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(DestReg) < 16) {
// We can use a normal VEX encoded load.
@@ -3952,7 +4044,7 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB,
const MCInstrDesc &StoreDesc,
const MCInstrDesc &ExtractDesc,
unsigned SubIdx) {
- unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
+ Register SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(SrcReg) < 16) {
// We can use a normal VEX encoded store.
@@ -4008,12 +4100,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::V_SET0:
case X86::FsFLD0SS:
case X86::FsFLD0SD:
+ case X86::FsFLD0F128:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
case X86::AVX_SET0: {
assert(HasAVX && "AVX not supported");
const TargetRegisterInfo *TRI = &getRegisterInfo();
- unsigned SrcReg = MIB->getOperand(0).getReg();
- unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
+ Register SrcReg = MIB->getOperand(0).getReg();
+ Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
MIB->getOperand(0).setReg(XReg);
Expand2AddrUndef(MIB, get(X86::VXORPSrr));
MIB.addReg(SrcReg, RegState::ImplicitDefine);
@@ -4021,9 +4114,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case X86::AVX512_128_SET0:
case X86::AVX512_FsFLD0SS:
- case X86::AVX512_FsFLD0SD: {
+ case X86::AVX512_FsFLD0SD:
+ case X86::AVX512_FsFLD0F128: {
bool HasVLX = Subtarget.hasVLX();
- unsigned SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB->getOperand(0).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
return Expand2AddrUndef(MIB,
@@ -4037,10 +4131,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0: {
bool HasVLX = Subtarget.hasVLX();
- unsigned SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB->getOperand(0).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
- unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
+ Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
MIB->getOperand(0).setReg(XReg);
Expand2AddrUndef(MIB,
get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
@@ -4060,14 +4154,14 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
case X86::AVX1_SETALLONES: {
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
// VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
MIB->setDesc(get(X86::VCMPPSYrri));
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
return true;
}
case X86::AVX512_512_SETALLONES: {
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
MIB->setDesc(get(X86::VPTERNLOGDZrri));
// VPTERNLOGD needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
@@ -4077,8 +4171,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case X86::AVX512_512_SEXT_MASK_32:
case X86::AVX512_512_SEXT_MASK_64: {
- unsigned Reg = MIB->getOperand(0).getReg();
- unsigned MaskReg = MIB->getOperand(1).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
+ Register MaskReg = MIB->getOperand(1).getReg();
unsigned MaskState = getRegState(MIB->getOperand(1));
unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
@@ -4115,8 +4209,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
case X86::MOV32ri64: {
- unsigned Reg = MIB->getOperand(0).getReg();
- unsigned Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
+ Register Reg = MIB->getOperand(0).getReg();
+ Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
MI.setDesc(get(X86::MOV32ri));
MIB->getOperand(0).setReg(Reg32);
MIB.addReg(Reg, RegState::ImplicitDefine);
@@ -4251,8 +4345,8 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// If MI is marked as reading Reg, the partial register update is wanted.
const MachineOperand &MO = MI.getOperand(0);
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isVirtualRegister(Reg)) {
if (MO.readsReg() || MI.readsVirtualRegister(Reg))
return 0;
} else {
@@ -4268,7 +4362,10 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// Return true for any instruction that copies the high bits of the first source
// operand into the unused high bits of the destination operand.
-static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) {
+static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
+ bool ForLoadFold = false) {
+ // Set the OpNum parameter to the first source operand.
+ OpNum = 1;
switch (Opcode) {
case X86::VCVTSI2SSrr:
case X86::VCVTSI2SSrm:
@@ -4427,6 +4524,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) {
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
return true;
+ case X86::VMOVSSZrrk:
+ case X86::VMOVSDZrrk:
+ OpNum = 3;
+ return true;
+ case X86::VMOVSSZrrkz:
+ case X86::VMOVSDZrrkz:
+ OpNum = 2;
+ return true;
}
return false;
@@ -4449,14 +4554,11 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) {
unsigned
X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const {
- if (!hasUndefRegUpdate(MI.getOpcode()))
+ if (!hasUndefRegUpdate(MI.getOpcode(), OpNum))
return 0;
- // Set the OpNum parameter to the first source operand.
- OpNum = 1;
-
const MachineOperand &MO = MI.getOperand(OpNum);
- if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) {
return UndefRegClearance;
}
return 0;
@@ -4464,7 +4566,7 @@ X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
void X86InstrInfo::breakPartialRegDependency(
MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
- unsigned Reg = MI.getOperand(OpNum).getReg();
+ Register Reg = MI.getOperand(OpNum).getReg();
// If MI kills this register, the false dependence is already broken.
if (MI.killsRegister(Reg, TRI))
return;
@@ -4480,7 +4582,7 @@ void X86InstrInfo::breakPartialRegDependency(
} else if (X86::VR256RegClass.contains(Reg)) {
// Use vxorps to clear the full ymm register.
// It wants to read and write the xmm sub-register.
- unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
.addReg(XReg, RegState::Undef)
.addReg(XReg, RegState::Undef)
@@ -4489,7 +4591,7 @@ void X86InstrInfo::breakPartialRegDependency(
} else if (X86::GR64RegClass.contains(Reg)) {
// Using XOR32rr because it has shorter encoding and zeros up the upper bits
// as well.
- unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit);
+ Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
.addReg(XReg, RegState::Undef)
.addReg(XReg, RegState::Undef)
@@ -4538,8 +4640,8 @@ static void updateOperandRegConstraints(MachineFunction &MF,
// We only need to update constraints on virtual register operands.
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TRI.isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
auto *NewRC = MRI.constrainRegClass(
@@ -4698,7 +4800,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
MachineInstr &MI) {
- if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) ||
+ unsigned Ignored;
+ if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) ||
!MI.getOperand(1).isReg())
return false;
@@ -4788,6 +4891,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (I != nullptr) {
unsigned Opcode = I->DstOp;
unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+ MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0;
if (Align < MinAlign)
return nullptr;
bool NarrowToMOV32rm = false;
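
The added line changes how the fold-table alignment is read: the bits masked out of I->Flags are now a compact code rather than a byte count, with 0 meaning no alignment requirement and a nonzero value v meaning 1 << (v - 1) bytes (so the tables presumably store log2(alignment) + 1). A tiny decode sketch under that assumption:

#include <cassert>
#include <cstdint>

// Decode the packed alignment code: 0 -> no requirement, v -> 1 << (v - 1) bytes.
static uint32_t decodeMinAlign(uint32_t Code) {
  return Code ? 1u << (Code - 1) : 0;
}

int main() {
  assert(decodeMinAlign(0) == 0);  // no constraint
  assert(decodeMinAlign(1) == 1);  // 1-byte aligned
  assert(decodeMinAlign(5) == 16); // 16-byte aligned
  assert(decodeMinAlign(7) == 64); // 64-byte aligned
  return 0;
}
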
@@ -4821,8 +4925,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// If this is the special case where we use a MOV32rm to load a 32-bit
// value and zero-extend the top bits. Change the destination register
// to a 32-bit one.
- unsigned DstReg = NewMI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ Register DstReg = NewMI->getOperand(0).getReg();
+ if (Register::isPhysicalRegister(DstReg))
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
else
NewMI->getOperand(0).setSubReg(X86::sub_32bit);
@@ -5133,6 +5237,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX512_128_SET0:
+ case X86::FsFLD0F128:
+ case X86::AVX512_FsFLD0F128:
Alignment = 16;
break;
case X86::MMX_SET0:
@@ -5182,7 +5288,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
case X86::FsFLD0SS:
- case X86::AVX512_FsFLD0SS: {
+ case X86::AVX512_FsFLD0SS:
+ case X86::FsFLD0F128:
+ case X86::AVX512_FsFLD0F128: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
// Create a constant-pool entry and operands to load from it.
@@ -5212,6 +5320,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Ty = Type::getFloatTy(MF.getFunction().getContext());
else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
Ty = Type::getDoubleTy(MF.getFunction().getContext());
+ else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
+ Ty = Type::getFP128Ty(MF.getFunction().getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
@@ -5293,6 +5403,51 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
return StoreMMOs;
}
+static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I,
+ const TargetRegisterClass *RC,
+ const X86Subtarget &STI) {
+ assert(STI.hasAVX512() && "Expected at least AVX512!");
+ unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
+ assert((SpillSize == 64 || STI.hasVLX()) &&
+ "Can't broadcast less than 64 bytes without AVX512VL!");
+
+ switch (I->Flags & TB_BCAST_MASK) {
+ default: llvm_unreachable("Unexpected broadcast type!");
+ case TB_BCAST_D:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VPBROADCASTDZ128m;
+ case 32: return X86::VPBROADCASTDZ256m;
+ case 64: return X86::VPBROADCASTDZm;
+ }
+ break;
+ case TB_BCAST_Q:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VPBROADCASTQZ128m;
+ case 32: return X86::VPBROADCASTQZ256m;
+ case 64: return X86::VPBROADCASTQZm;
+ }
+ break;
+ case TB_BCAST_SS:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VBROADCASTSSZ128m;
+ case 32: return X86::VBROADCASTSSZ256m;
+ case 64: return X86::VBROADCASTSSZm;
+ }
+ break;
+ case TB_BCAST_SD:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VMOVDDUPZ128rm;
+ case 32: return X86::VBROADCASTSDZ256m;
+ case 64: return X86::VBROADCASTSDZm;
+ }
+ break;
+ }
+}
+
bool X86InstrInfo::unfoldMemoryOperand(
MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
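
getBroadcastOpcode lets the unfold paths below re-materialize an operand that was folded as an embedded broadcast: instead of reloading a full spill-width vector, they pick the broadcast instruction matching the element kind (TB_BCAST_D/Q/SS/SD) and the register class's spill size. A scalar model of why the reload replicates a single element rather than loading the whole width (plain C++, illustrative only):

#include <array>
#include <cassert>
#include <cstdint>

// A folded broadcast operand reads one element from memory and uses it in
// every lane, so unfolding it rebuilds the vector by replication.
int main() {
  int32_t MemScalar = 42;            // the single dword the broadcast reads
  std::array<int32_t, 16> Reloaded;  // a 512-bit register's worth of dwords
  Reloaded.fill(MemScalar);          // what a VPBROADCASTD-style reload does
  for (int32_t Lane : Reloaded)
    assert(Lane == 42);
  return 0;
}
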
@@ -5303,6 +5458,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+ bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
if (UnfoldLoad && !FoldedLoad)
return false;
UnfoldLoad &= FoldedLoad;
@@ -5311,7 +5467,9 @@ bool X86InstrInfo::unfoldMemoryOperand(
UnfoldStore &= FoldedStore;
const MCInstrDesc &MCID = get(Opc);
+
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
// TODO: Check if 32-byte or greater accesses are slow too?
if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
Subtarget.isUnalignedMem16Slow())
@@ -5335,10 +5493,26 @@ bool X86InstrInfo::unfoldMemoryOperand(
AfterOps.push_back(Op);
}
- // Emit the load instruction.
+ // Emit the load or broadcast instruction.
if (UnfoldLoad) {
auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
- loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs, NewMIs);
+
+ unsigned Opc;
+ if (FoldedBCast) {
+ Opc = getBroadcastOpcode(I, RC, Subtarget);
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
+ }
+
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
+ for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
+ MIB.add(AddrOps[i]);
+ MIB.setMemRefs(MMOs);
+ NewMIs.push_back(MIB);
+
if (UnfoldStore) {
// Address operands cannot be marked isKill.
for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
@@ -5404,7 +5578,16 @@ bool X86InstrInfo::unfoldMemoryOperand(
if (UnfoldStore) {
const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
- storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs, NewMIs);
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
+ MIB.add(AddrOps[i]);
+ MIB.addReg(Reg, RegState::Kill);
+ MIB.setMemRefs(MMOs);
+ NewMIs.push_back(MIB);
}
return true;
@@ -5423,6 +5606,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+ bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -5456,10 +5640,17 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
- unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
- Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
- VT, MVT::Other, AddrOps);
+
+ unsigned Opc;
+ if (FoldedBCast) {
+ Opc = getBroadcastOpcode(I, RC, Subtarget);
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
+ }
+
+ Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
NewNodes.push_back(Load);
// Preserve memory reference information.
@@ -7367,6 +7558,96 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
}
}
+Optional<ParamLoadedValue>
+X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const {
+ const MachineOperand *Op = nullptr;
+ DIExpression *Expr = nullptr;
+
+ switch (MI.getOpcode()) {
+ case X86::LEA32r:
+ case X86::LEA64r:
+ case X86::LEA64_32r: {
+    // Operand 4 could be a global address. For now we do not support
+    // such situations.
+ if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
+ return None;
+
+ const MachineOperand &Op1 = MI.getOperand(1);
+ const MachineOperand &Op2 = MI.getOperand(3);
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ assert(Op2.isReg() && (Op2.getReg() == X86::NoRegister ||
+ Register::isPhysicalRegister(Op2.getReg())));
+
+ // Omit situations like:
+ // %rsi = lea %rsi, 4, ...
+ if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
+ Op2.getReg() == MI.getOperand(0).getReg())
+ return None;
+ else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
+ (Op2.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
+ return None;
+
+ int64_t Coef = MI.getOperand(2).getImm();
+ int64_t Offset = MI.getOperand(4).getImm();
+ SmallVector<uint64_t, 8> Ops;
+
+ if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
+ Op = &Op1;
+ } else if (Op1.isFI())
+ Op = &Op1;
+
+ if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
+ Ops.push_back(dwarf::DW_OP_constu);
+ Ops.push_back(Coef + 1);
+ Ops.push_back(dwarf::DW_OP_mul);
+ } else {
+ if (Op && Op2.getReg() != X86::NoRegister) {
+ int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
+ if (dwarfReg < 0)
+ return None;
+ else if (dwarfReg < 32) {
+ Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
+ Ops.push_back(0);
+ } else {
+ Ops.push_back(dwarf::DW_OP_bregx);
+ Ops.push_back(dwarfReg);
+ Ops.push_back(0);
+ }
+ } else if (!Op) {
+ assert(Op2.getReg() != X86::NoRegister);
+ Op = &Op2;
+ }
+
+ if (Coef > 1) {
+ assert(Op2.getReg() != X86::NoRegister);
+ Ops.push_back(dwarf::DW_OP_constu);
+ Ops.push_back(Coef);
+ Ops.push_back(dwarf::DW_OP_mul);
+ }
+
+ if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
+ Op2.getReg() != X86::NoRegister) {
+ Ops.push_back(dwarf::DW_OP_plus);
+ }
+ }
+
+ DIExpression::appendOffset(Ops, Offset);
+ Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
+
+    return ParamLoadedValue(*Op, Expr);
+ }
+ case X86::XOR32rr: {
+ if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
+ return ParamLoadedValue(MachineOperand::CreateImm(0), Expr);
+ return None;
+ }
+ default:
+ return TargetInstrInfo::describeLoadedValue(MI);
+ }
+}
+
/// This is an architecture-specific helper function of reassociateOps.
/// Set special operand attributes for new instructions after reassociation.
void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
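
For the LEA cases, describeLoadedValue reports the destination as base + index * scale + displacement: it returns the base operand together with a DIExpression that adds the scaled index (DW_OP_breg/DW_OP_constu/DW_OP_mul/DW_OP_plus) and then the displacement. A standalone arithmetic sketch for a hypothetical `lea 16(%rdi,%rsi,4), %rax` (the register numbers and exact op sequence are assumptions here; only the identity being encoded is illustrated):

#include <cassert>
#include <cstdint>

// The value the returned (operand, DIExpression) pair describes for an LEA:
//   dst = base + index * scale + displacement
static int64_t leaValue(int64_t Base, int64_t Index, int64_t Scale,
                        int64_t Disp) {
  return Base + Index * Scale + Disp;
}

int main() {
  // lea 16(%rdi,%rsi,4), %rax with %rdi = 100 and %rsi = 3 would be described
  // roughly as (%rdi, DIExpression(DW_OP_breg4 0, DW_OP_constu 4, DW_OP_mul,
  // DW_OP_plus, DW_OP_plus_uconst 16)), i.e. 100 + 3 * 4 + 16.
  assert(leaValue(100, 3, 4, 16) == 128);
  return 0;
}
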
@@ -7500,9 +7781,8 @@ namespace {
// movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
// addq %rcx, %rax
// RAX now holds address of _GLOBAL_OFFSET_TABLE_.
- unsigned PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
- unsigned GOTReg =
- RegInfo.createVirtualRegister(&X86::GR64RegClass);
+ Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+ Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
.addReg(X86::RIP)
.addImm(0)
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 13ca17139494..22b7b1d4cb19 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -67,6 +67,9 @@ unsigned getSwappedVPCMPImm(unsigned Imm);
/// Get the VPCOM immediate if the opcodes are swapped.
unsigned getSwappedVPCOMImm(unsigned Imm);
+/// Get the VCMP immediate if the operands are swapped.
+unsigned getSwappedVCMPImm(unsigned Imm);
+
} // namespace X86
/// isGlobalStubReference - Return true if the specified TargetFlag operand is
@@ -203,7 +206,7 @@ public:
int &FrameIndex) const override;
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const override;
+ AAResults *AA) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
const MachineInstr &Orig,
@@ -218,7 +221,7 @@ public:
/// Reference parameters are set to indicate how caller should add this
/// operand to the LEA instruction.
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
- unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
+ unsigned LEAOpcode, bool AllowSP, Register &NewSrc,
bool &isKill, MachineOperand &ImplicitOp,
LiveVariables *LV) const;
@@ -251,7 +254,7 @@ public:
/// findCommutedOpIndices(MI, Op1, Op2);
/// can be interpreted as a query asking to find an operand that would be
/// commutable with the operand#1.
- bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
/// Returns an adjusted FMA opcode that must be used in FMA instruction that
@@ -317,23 +320,11 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- ArrayRef<MachineMemOperand *> MMOs,
- SmallVectorImpl<MachineInstr *> &NewMIs) const;
-
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- ArrayRef<MachineMemOperand *> MMOs,
- SmallVectorImpl<MachineInstr *> &NewMIs) const;
-
bool expandPostRAPseudo(MachineInstr &MI) const override;
/// Check whether the target can fold a load that feeds a subreg operand
@@ -527,6 +518,13 @@ public:
#define GET_INSTRINFO_HELPER_DECLS
#include "X86GenInstrInfo.inc"
+ static bool hasLockPrefix(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & X86II::LOCK;
+ }
+
+ Optional<ParamLoadedValue>
+ describeLoadedValue(const MachineInstr &MI) const override;
+
protected:
/// Commutes the operands in the given instruction by changing the operands
/// order and/or changing the instruction's opcode and/or the immediate value
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 8e05dd8ec5c1..e452145f3b65 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -673,6 +673,14 @@ def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
ImmSExti64i32AsmOperand];
}
+// 4-bit immediate used by some XOP instructions
+// [0, 0xF]
+def ImmUnsignedi4AsmOperand : AsmOperandClass {
+ let Name = "ImmUnsignedi4";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidImmUnsignedi4";
+}
+
// Unsigned immediate used by SSE/AVX instructions
// [0, 0xFF]
// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
@@ -705,6 +713,13 @@ def i64i8imm : Operand<i64> {
let OperandType = "OPERAND_IMMEDIATE";
}
+// Unsigned 4-bit immediate used by some XOP instructions.
+def u4imm : Operand<i8> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi4AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
// Unsigned 8-bit immediate used by SSE/AVX instructions.
def u8imm : Operand<i8> {
let PrintMethod = "printU8Imm";
@@ -925,7 +940,6 @@ def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">;
def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
-def HasMPX : Predicate<"Subtarget->hasMPX()">;
def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">;
def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
@@ -1103,7 +1117,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
if (ExtType == ISD::NON_EXTLOAD)
return true;
if (ExtType == ISD::EXTLOAD)
- return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return LD->getAlignment() >= 2 && LD->isSimple();
return false;
}]>;
@@ -1113,7 +1127,7 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
if (ExtType == ISD::NON_EXTLOAD)
return true;
if (ExtType == ISD::EXTLOAD)
- return LD->getAlignment() >= 4 && !LD->isVolatile();
+ return LD->getAlignment() >= 4 && LD->isSimple();
return false;
}]>;
@@ -1170,7 +1184,7 @@ def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [
if (LD->getMemoryVT() == MVT::i32)
return true;
- return LD->getAlignment() >= 4 && !LD->isVolatile();
+ return LD->getAlignment() >= 4 && LD->isSimple();
}]>;
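
Switching these extending-load predicates from !LD->isVolatile() to LD->isSimple() makes them stricter: a simple load is, as these predicates use the term, one that is neither volatile nor atomic, so atomic loads no longer get folded either. A one-line model of the predicate (illustrative, not the SelectionDAG class):

#include <cassert>

// Model of a "simple" memory access: neither volatile nor atomic.
struct LoadFlags {
  bool Volatile;
  bool Atomic;
  bool isSimple() const { return !Volatile && !Atomic; }
};

int main() {
  assert(LoadFlags{false, false}.isSimple());
  assert(!LoadFlags{true, false}.isSimple()); // volatile -> not simple
  assert(!LoadFlags{false, true}.isSimple()); // atomic   -> not simple
  return 0;
}
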
@@ -2404,25 +2418,26 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
}
multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
- RegisterClass RC, X86MemOperand x86memop> {
+ RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8PS, VEX_4V, Sched<[WriteBLS]>;
+ T8PS, VEX_4V, Sched<[sched]>;
let mayLoad = 1 in
def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8PS, VEX_4V, Sched<[WriteBLS.Folded]>;
+ T8PS, VEX_4V, Sched<[sched.Folded]>;
}
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
- defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>;
- defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W;
- defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>;
- defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W;
- defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>;
- defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W;
+ defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>;
+ defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, VEX_W;
+ defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>;
+ defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, VEX_W;
+ defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>;
+ defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, VEX_W;
}
//===----------------------------------------------------------------------===//
@@ -2683,12 +2698,12 @@ def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst",
multiclass lwpins_intr<RegisterClass RC> {
def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>,
+ [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>,
XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>,
+ [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>,
XOP_4V, XOPA;
}
@@ -2700,11 +2715,11 @@ let Defs = [EFLAGS] in {
multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, GR32:$src1, imm:$cntl)]>, XOP_4V, XOPA;
+ [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)]>,
+ [(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>,
XOP_4V, XOPA;
}
@@ -3205,13 +3220,13 @@ def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
// Likewise for btc/btr/bts.
def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}",
- (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
+ (BT32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}",
- (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
+ (BTC32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}",
- (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
+ (BTR32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}",
- (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
+ (BTS32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
// clr aliases.
def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 57835b1a256a..cd9a866c91cb 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -30,7 +30,6 @@ def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>;
let Constraints = "$src1 = $dst" in {
// MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
- // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
X86FoldableSchedWrite sched, bit Commutable = 0,
X86MemOperand OType = i64mem> {
@@ -67,7 +66,7 @@ let Constraints = "$src1 = $dst" in {
def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
(ins VR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))]>,
+ [(set VR64:$dst, (IntId2 VR64:$src1, timm:$src2))]>,
Sched<[schedImm]>;
}
}
@@ -114,13 +113,13 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId,
def rri : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>,
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 timm:$src3)))]>,
Sched<[sched]>;
def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>,
+ (bitconvert (load_mmx addr:$src2)), (i8 timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -496,14 +495,14 @@ def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
- (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>,
+ (int_x86_sse_pshuf_w VR64:$src1, timm:$src2))]>,
Sched<[SchedWriteShuffle.MMX]>;
def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
(outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
(int_x86_sse_pshuf_w (load_mmx addr:$src1),
- imm:$src2))]>,
+ timm:$src2))]>,
Sched<[SchedWriteShuffle.MMX.Folded]>;
// -- Conversion Instructions
@@ -535,7 +534,7 @@ def MMX_PEXTRWrr: MMXIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
- imm:$src2))]>,
+ timm:$src2))]>,
Sched<[WriteVecExtract]>;
let Constraints = "$src1 = $dst" in {
let Predicates = [HasMMX, HasSSE1] in {
@@ -544,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in {
(ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
- GR32orGR64:$src2, imm:$src3))]>,
+ GR32orGR64:$src2, timm:$src3))]>,
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
@@ -553,7 +552,7 @@ let Predicates = [HasMMX, HasSSE1] in {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
(i32 (anyext (loadi16 addr:$src2))),
- imm:$src3))]>,
+ timm:$src3))]>,
Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
}
@@ -567,6 +566,13 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(int_x86_mmx_pmovmskb VR64:$src))]>,
Sched<[WriteMMXMOVMSK]>;
+// MMX to XMM for vector types
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+ [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
// Low word of XMM to MMX.
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
[SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
@@ -574,9 +580,13 @@ def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
(x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
-def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
+def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))),
(x86mmx (MMX_MOVQ64rm addr:$src))>;
+def : Pat<(v2i64 (X86vzmovl (scalar_to_vector
+ (i64 (bitconvert (x86mmx VR64:$src)))))),
+ (MMX_MOVQ2DQrr VR64:$src)>;
+
// Misc.
let SchedRW = [SchedWriteShuffle.MMX] in {
let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in
@@ -602,9 +612,6 @@ def : Pat<(x86mmx (MMX_X86movdq2q
(bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))),
(MMX_CVTTPS2PIirr VR128:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
- (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))),
- (MMX_CVTTPS2PIirr VR128:$src)>;
-def : Pat<(x86mmx (MMX_X86movdq2q
(bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
(MMX_CVTPD2PIirr VR128:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
index f7d931510fe2..44ba071947c2 100644
--- a/lib/Target/X86/X86InstrMPX.td
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -12,16 +12,16 @@
//
//===----------------------------------------------------------------------===//
-// FIXME: Investigate a better scheduler class once MPX is used inside LLVM.
+// FIXME: Investigate a better scheduler class if MPX is ever used inside LLVM.
let SchedRW = [WriteSystem] in {
multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
- Requires<[HasMPX, Not64BitMode]>;
+ Requires<[Not64BitMode]>;
def 64rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
- Requires<[HasMPX, In64BitMode]>;
+ Requires<[In64BitMode]>;
}
defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
@@ -29,17 +29,17 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[HasMPX, Not64BitMode]>;
+ Requires<[Not64BitMode]>;
def 64rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[HasMPX, In64BitMode]>;
+ Requires<[In64BitMode]>;
def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[HasMPX, Not64BitMode]>;
+ Requires<[Not64BitMode]>;
def 64rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[HasMPX, In64BitMode]>;
+ Requires<[In64BitMode]>;
}
defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable;
defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable;
@@ -47,33 +47,31 @@ defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable;
def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX]>, NotMemoryFoldable;
+ NotMemoryFoldable;
let mayLoad = 1 in {
def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable;
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
def BNDMOV64rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable;
+ Requires<[In64BitMode]>, NotMemoryFoldable;
}
let isCodeGenOnly = 1, ForceDisassemble = 1 in
def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX]>, NotMemoryFoldable;
+ NotMemoryFoldable;
let mayStore = 1 in {
def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable;
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable;
+ Requires<[In64BitMode]>, NotMemoryFoldable;
def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src),
- "bndstx\t{$src, $dst|$dst, $src}", []>, PS,
- Requires<[HasMPX]>;
+ "bndstx\t{$src, $dst|$dst, $src}", []>, PS;
}
let mayLoad = 1 in
def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
- "bndldx\t{$src, $dst|$dst, $src}", []>, PS,
- Requires<[HasMPX]>;
+ "bndldx\t{$src, $dst|$dst, $src}", []>, PS;
} // SchedRW
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 7d0a5b87baf4..09a04c0338b4 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -115,7 +115,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
[(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
- [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
+ [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
+ def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}
//===----------------------------------------------------------------------===//
@@ -128,13 +130,18 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteZero] in {
+ isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4f32 immAllZerosV))]>;
}
-let Predicates = [NoAVX512] in
+let Predicates = [NoAVX512] in {
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+}
// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
@@ -147,6 +154,14 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllZerosV))]>;
}
+let Predicates = [NoAVX512] in {
+def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+}
+
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -355,7 +370,7 @@ defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
SSEPackedSingle, SchedWriteFMoveLS.YMM>,
PS, VEX, VEX_L, VEX_WIG;
-defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
SSEPackedDouble, SchedWriteFMoveLS.YMM>,
PD, VEX, VEX_L, VEX_WIG;
}
@@ -661,7 +676,7 @@ let Predicates = [UseSSE1] in {
// This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
// end up with a movsd or blend instead of shufp.
// No need for aligned load, we're only loading 64-bits.
- def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
+ def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
(i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
@@ -727,7 +742,7 @@ let Predicates = [UseSSE1] in {
// This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
// end up with a movsd or blend instead of shufp.
// No need for aligned load, we're only loading 64-bits.
- def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
+ def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
@@ -761,7 +776,7 @@ let Predicates = [UseSSE2] in {
let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
// Use MOVLPD to load into the low bits from a full vector unless we can use
// BLENDPD.
- def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))),
+ def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
}
@@ -1713,12 +1728,12 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
Sched<[sched]>;
def rm : SIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1),
- (ld_frag addr:$src2), imm:$cc))]>,
+ (ld_frag addr:$src2), timm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1751,13 +1766,13 @@ multiclass sse12_cmp_scalar_int<Operand memop,
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- VR128:$src, imm:$cc))]>,
+ VR128:$src, timm:$cc))]>,
Sched<[sched]>;
let mayLoad = 1 in
def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src, u8imm:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- mem_cpat:$src, imm:$cc))]>,
+ mem_cpat:$src, timm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1876,12 +1891,12 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
- [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
+ [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
Sched<[sched]>;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst,
- (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
+ (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1906,7 +1921,7 @@ let Constraints = "$src1 = $dst" in {
SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}
-def CommutableCMPCC : PatLeaf<(imm), [{
+def CommutableCMPCC : PatLeaf<(timm), [{
uint64_t Imm = N->getZExtValue() & 0x7;
return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
@@ -1915,47 +1930,47 @@ def CommutableCMPCC : PatLeaf<(imm), [{
let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
CommutableCMPCC:$cc)),
- (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+ (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
CommutableCMPCC:$cc)),
- (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+ (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
CommutableCMPCC:$cc)),
- (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+ (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
CommutableCMPCC:$cc)),
- (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+ (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
CommutableCMPCC:$cc)),
- (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
+ (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
CommutableCMPCC:$cc)),
- (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
+ (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}
let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
CommutableCMPCC:$cc)),
- (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+ (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
CommutableCMPCC:$cc)),
- (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
+ (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
}
let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
CommutableCMPCC:$cc)),
- (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+ (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
CommutableCMPCC:$cc)),
- (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
+ (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}
//===----------------------------------------------------------------------===//
@@ -1970,13 +1985,13 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
- (i8 imm:$src3))))], d>,
+ (i8 timm:$src3))))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCommutable = IsCommutable in
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
- (i8 imm:$src3))))], d>,
+ (i8 timm:$src3))))], d>,
Sched<[sched]>;
}
@@ -2097,7 +2112,7 @@ let Predicates = [HasAVX1Only] in {
let Predicates = [UseSSE2] in {
// Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
- (v2f64 (nonvolatile_load addr:$src2)))),
+ (v2f64 (simple_load addr:$src2)))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
}
@@ -2721,7 +2736,7 @@ defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64,
defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
-
+
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
@@ -3482,7 +3497,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
Sched<[schedImm]>;
}
@@ -3514,7 +3529,7 @@ multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
Sched<[sched]>;
}
@@ -3597,7 +3612,7 @@ let Predicates = [HasAVX, prd] in {
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+ (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
VEX, Sched<[sched.XMM]>, VEX_WIG;
def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
@@ -3605,7 +3620,7 @@ let Predicates = [HasAVX, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (load addr:$src1),
- (i8 imm:$src2))))]>, VEX,
+ (i8 timm:$src2))))]>, VEX,
Sched<[sched.XMM.Folded]>, VEX_WIG;
}
@@ -3615,7 +3630,7 @@ let Predicates = [HasAVX2, prd] in {
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
+ (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, u8imm:$src2),
@@ -3623,7 +3638,7 @@ let Predicates = [HasAVX2, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode (load addr:$src1),
- (i8 imm:$src2))))]>, VEX, VEX_L,
+ (i8 timm:$src2))))]>, VEX, VEX_L,
Sched<[sched.YMM.Folded]>, VEX_WIG;
}
@@ -3633,7 +3648,7 @@ let Predicates = [UseSSE2] in {
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+ (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
Sched<[sched.XMM]>;
def mi : Ii8<0x70, MRMSrcMem,
(outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
@@ -3641,7 +3656,7 @@ let Predicates = [UseSSE2] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (memop addr:$src1),
- (i8 imm:$src2))))]>,
+ (i8 timm:$src2))))]>,
Sched<[sched.XMM.Folded]>;
}
}
@@ -4380,7 +4395,7 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
@@ -4388,7 +4403,7 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [UseSSE3] in {
// No need for aligned memory as this only loads 64-bits.
- def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
(MOVDDUPrm addr:$src)>;
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(MOVDDUPrm addr:$src)>;
@@ -4812,7 +4827,7 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
+ [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
Sched<[sched]>;
let mayLoad = 1 in
def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
@@ -4823,7 +4838,7 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst, (VT (X86PAlignr RC:$src1,
(memop_frag addr:$src2),
- (i8 imm:$src3))))]>,
+ (i8 timm:$src3))))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5300,7 +5315,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
+ (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
Sched<[SchedWriteFShuffle.XMM]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f32mem:$src2, u8imm:$src3),
@@ -5311,7 +5326,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
[(set VR128:$dst,
(X86insertps VR128:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))]>,
+ timm:$src3))]>,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
@@ -5323,17 +5338,6 @@ let ExeDomain = SSEPackedSingle in {
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
-let Predicates = [UseAVX] in {
- // If we're inserting an element from a vbroadcast of a load, fold the
- // load into the X86insertps instruction.
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
- (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
- (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
- (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
- (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
-}
-
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
@@ -5348,7 +5352,7 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
(outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
Sched<[sched]>;
// Vector intrinsic operation, mem
@@ -5357,13 +5361,13 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>,
+ (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
Sched<[sched.Folded]>;
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
-let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
@@ -5378,7 +5382,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
-let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
@@ -5396,7 +5400,7 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
-let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
@@ -5411,7 +5415,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
-let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
@@ -5431,7 +5435,7 @@ multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched,
ValueType VT32, ValueType VT64,
SDNode OpNode, bit Is2Addr = 1> {
-let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedSingle in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -5439,7 +5443,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
"ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
+ [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
@@ -5450,11 +5454,11 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+ (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
-let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedDouble in {
def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -5462,7 +5466,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
"sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
+ [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
@@ -5473,7 +5477,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
@@ -5508,17 +5512,17 @@ let Predicates = [UseAVX] in {
}
let Predicates = [UseAVX] in {
- def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>;
- def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
+ def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
- def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
- def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+ def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}
let ExeDomain = SSEPackedSingle in
@@ -5535,17 +5539,17 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
- def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
- (ROUNDSSr FR32:$src1, imm:$src2)>;
- def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
- (ROUNDSDr FR64:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
+ (ROUNDSSr FR32:$src1, timm:$src2)>;
+ def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
+ (ROUNDSDr FR64:$src1, timm:$src2)>;
}
let Predicates = [UseSSE41, OptForSize] in {
- def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
- (ROUNDSSm addr:$src1, imm:$src2)>;
- def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
- (ROUNDSDm addr:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
+ (ROUNDSSm addr:$src1, timm:$src2)>;
+ def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
+ (ROUNDSDm addr:$src1, timm:$src2)>;
}
//===----------------------------------------------------------------------===//
@@ -5826,7 +5830,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -5836,7 +5840,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
- (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
+ (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5853,7 +5857,7 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -5863,27 +5867,27 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-def BlendCommuteImm2 : SDNodeXForm<imm, [{
+def BlendCommuteImm2 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue() & 0x03;
return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;
-def BlendCommuteImm4 : SDNodeXForm<imm, [{
+def BlendCommuteImm4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue() & 0x0f;
return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;
-def BlendCommuteImm8 : SDNodeXForm<imm, [{
+def BlendCommuteImm8 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue() & 0xff;
return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;
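
Editor's note: the three BlendCommuteImm transforms just above mask the blend immediate down to the bits that are actually used (2, 4 or 8 lanes) and invert them; in a BLENDPS/BLENDPD/PBLENDW immediate a set bit selects the lane from the second source, so swapping the two sources is equivalent to flipping every lane-select bit. A minimal standalone C++ sketch of that arithmetic, separate from the TableGen above and with illustrative names only:

#include <cassert>
#include <cstdint>

// Invert the low NumLanes bits of a blend immediate, mirroring what
// BlendCommuteImm2/4/8 compute when the two vector sources of a blend are
// swapped (each set bit means "take this lane from the second source").
static uint8_t commuteBlendImm(uint8_t Imm, unsigned NumLanes) {
  uint8_t Mask = uint8_t((1u << NumLanes) - 1u); // 0x03, 0x0f or 0xff
  return uint8_t((Imm & Mask) ^ Mask);
}

int main() {
  // blendps with imm 0b0101 takes lanes 0 and 2 from the second source;
  // after swapping the sources the same shuffle needs imm 0b1010.
  assert(commuteBlendImm(0x05, 4) == 0x0a);
  assert(commuteBlendImm(0x01, 2) == 0x02);
  return 0;
}
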
// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
-def BlendScaleImm4 : SDNodeXForm<imm, [{
+def BlendScaleImm4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 4; ++i) {
@@ -5894,7 +5898,7 @@ def BlendScaleImm4 : SDNodeXForm<imm, [{
}]>;
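
Editor's note: BlendScaleImm4, whose body is split across the hunk boundary above, widens a 4-bit per-f32/i32 blend mask into the 8-bit mask pblendw expects; each source bit covers two 16-bit lanes, so it is duplicated into two adjacent bits. A small C++ sketch of the same expansion, with illustrative names; the loop body is reconstructed here from the transform's name and the surrounding BlendScale* definitions, so treat it as an assumption:

#include <cassert>
#include <cstdint>

// Widen a 4-bit vXi32/vXf32 blend mask to the 8-bit pblendw mask by copying
// each bit into the pair of 16-bit lanes that the 32-bit lane covers.
static uint8_t scaleBlendImm4To8(uint8_t Imm) {
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i)
    if (Imm & (1u << i))
      NewImm |= uint8_t(3u << (i * 2));
  return NewImm;
}

int main() {
  assert(scaleBlendImm4To8(0x5) == 0x33); // lanes 0,2 -> word pairs 0-1, 4-5
  assert(scaleBlendImm4To8(0xF) == 0xFF);
  return 0;
}
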
// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
-def BlendScaleImm2 : SDNodeXForm<imm, [{
+def BlendScaleImm2 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
@@ -5905,7 +5909,7 @@ def BlendScaleImm2 : SDNodeXForm<imm, [{
}]>;
// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
-def BlendScaleImm2to4 : SDNodeXForm<imm, [{
+def BlendScaleImm2to4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
@@ -5916,7 +5920,7 @@ def BlendScaleImm2to4 : SDNodeXForm<imm, [{
}]>;
// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
-def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
+def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 4; ++i) {
@@ -5927,7 +5931,7 @@ def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
}]>;
// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
-def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
+def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
@@ -5938,7 +5942,7 @@ def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
}]>;
// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
-def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{
+def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
@@ -6008,7 +6012,7 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -6018,14 +6022,14 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
// Pattern to commute if load is in first source.
- def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
+ def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
(!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
- (commuteXForm imm:$src3))>;
+ (commuteXForm timm:$src3))>;
}
let Predicates = [HasAVX] in {
@@ -6061,37 +6065,37 @@ let Predicates = [HasAVX2] in {
// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will cleanup domains later on.
let Predicates = [HasAVX1Only] in {
-def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
- (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
- (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
- (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
+ (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movsd via commuting under optsize.
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
-
-def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
- (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
- (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
- (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
+
+def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
+ (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
+def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
- (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}
defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
@@ -6107,19 +6111,19 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
let Predicates = [UseSSE41] in {
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
- (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
- (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
- (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
- (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}
// For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -6592,7 +6596,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
- (i8 imm:$src3)))]>, TA,
+ (i8 timm:$src3)))]>, TA,
Sched<[SchedWriteVecIMul.XMM]>;
def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
@@ -6600,7 +6604,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1,
(memop addr:$src2),
- (i8 imm:$src3)))]>, TA,
+ (i8 timm:$src3)))]>, TA,
Sched<[SchedWriteVecIMul.XMM.Folded,
SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -6718,26 +6722,26 @@ let Predicates = [HasAVX, HasAES] in {
(ins VR128:$src1, u8imm:$src2),
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
"aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
"aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
Sched<[WriteAESKeyGen.Folded]>;
//===----------------------------------------------------------------------===//
@@ -6745,7 +6749,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
//===----------------------------------------------------------------------===//
// Immediate transform to help with commuting.
-def PCLMULCommuteImm : SDNodeXForm<imm, [{
+def PCLMULCommuteImm : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
}]>;
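
Editor's note: PCLMULCommuteImm swaps the two nibbles of the pclmulqdq immediate. Bit 0 selects which quadword of the first source is multiplied and bit 4 selects the quadword of the second source, so exchanging the sources only requires exchanging the nibbles. A standalone C++ sketch (illustrative names, not part of the patch):

#include <cassert>
#include <cstdint>

// Swap the immediate nibbles of a pclmulqdq selector, as PCLMULCommuteImm
// does, so the instruction can be emitted with its sources commuted.
static uint8_t commutePclmulImm(uint8_t Imm) {
  return uint8_t((Imm >> 4) | (Imm << 4));
}

int main() {
  assert(commutePclmulImm(0x01) == 0x10); // high(src1)*low(src2) becomes 0x10
                                          // once the operands are swapped
  assert(commutePclmulImm(0x11) == 0x11); // symmetric selections are unchanged
  return 0;
}
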
@@ -6758,7 +6762,7 @@ let Predicates = [NoAVX, HasPCLMUL] in {
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
- (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
Sched<[WriteCLMul]>;
def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
@@ -6766,14 +6770,14 @@ let Predicates = [NoAVX, HasPCLMUL] in {
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
- imm:$src3))]>,
+ timm:$src3))]>,
Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
} // Constraints = "$src1 = $dst"
def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
- (i8 imm:$src3)),
+ (i8 timm:$src3)),
(PCLMULQDQrm VR128:$src1, addr:$src2,
- (PCLMULCommuteImm imm:$src3))>;
+ (PCLMULCommuteImm timm:$src3))>;
} // Predicates = [NoAVX, HasPCLMUL]
// SSE aliases
@@ -6795,21 +6799,21 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
(ins RC:$src1, RC:$src2, u8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set RC:$dst,
- (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ (IntId RC:$src1, RC:$src2, timm:$src3))]>,
Sched<[WriteCLMul]>;
def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, MemOp:$src2, u8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set RC:$dst,
- (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
+ (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
// We can commute a load in the first operand by swapping the sources and
// rotating the immediate.
- def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
+ def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
(!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
- (PCLMULCommuteImm imm:$src3))>;
+ (PCLMULCommuteImm timm:$src3))>;
}
let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
@@ -6853,8 +6857,8 @@ let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
(ins VR128:$src, u8imm:$len, u8imm:$idx),
"extrq\t{$idx, $len, $src|$src, $len, $idx}",
- [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
- imm:$idx))]>,
+ [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
+ timm:$idx))]>,
PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
@@ -6867,7 +6871,7 @@ def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
"insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
[(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
- imm:$len, imm:$idx))]>,
+ timm:$len, timm:$idx))]>,
XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
@@ -6907,10 +6911,10 @@ def : Pat<(nontemporalstore FR64:$src, addr:$dst),
//
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType VT,
- PatFrag ld_frag, SchedWrite Sched> :
+ PatFrag bcast_frag, SchedWrite Sched> :
AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+ [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
Sched<[Sched]>, VEX;
// AVX2 adds register forms
@@ -6923,15 +6927,15 @@ class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
- f32mem, v4f32, loadf32,
+ f32mem, v4f32, X86VBroadcastld32,
SchedWriteFShuffle.XMM.Folded>;
def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
- f32mem, v8f32, loadf32,
+ f32mem, v8f32, X86VBroadcastld32,
SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
- v4f64, loadf64,
+ v4f64, X86VBroadcastld64,
SchedWriteFShuffle.XMM.Folded>, VEX_L;
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
@@ -6944,15 +6948,6 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
v4f64, v2f64, WriteFShuffle256>, VEX_L;
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (VBROADCASTSSrm addr:$src)>;
- def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (VBROADCASTSSYrm addr:$src)>;
- def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (VBROADCASTSDYrm addr:$src)>;
-}
-
//===----------------------------------------------------------------------===//
// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
// halves of a 256-bit vector.
@@ -7081,27 +7076,29 @@ let Predicates = [HasAVX1Only] in {
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
Intrinsic IntLd, Intrinsic IntLd256,
- Intrinsic IntSt, Intrinsic IntSt256> {
+ Intrinsic IntSt, Intrinsic IntSt256,
+ X86SchedWriteMaskMove schedX,
+ X86SchedWriteMaskMove schedY> {
def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
- VEX_4V, Sched<[WriteFMaskedLoad]>;
+ VEX_4V, Sched<[schedX.RM]>;
def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
+ VEX_4V, VEX_L, Sched<[schedY.RM]>;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
- VEX_4V, Sched<[WriteFMaskedStore]>;
+ VEX_4V, Sched<[schedX.MR]>;
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
- VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
+ VEX_4V, VEX_L, Sched<[schedY.MR]>;
}
let ExeDomain = SSEPackedSingle in
@@ -7109,13 +7106,15 @@ defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
int_x86_avx_maskload_ps,
int_x86_avx_maskload_ps_256,
int_x86_avx_maskstore_ps,
- int_x86_avx_maskstore_ps_256>;
+ int_x86_avx_maskstore_ps_256,
+ WriteFMaskMove32, WriteFMaskMove32Y>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
int_x86_avx_maskload_pd,
int_x86_avx_maskload_pd_256,
int_x86_avx_maskstore_pd,
- int_x86_avx_maskstore_pd_256>;
+ int_x86_avx_maskstore_pd_256,
+ WriteFMaskMove64, WriteFMaskMove64Y>;
//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
@@ -7143,13 +7142,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
+ [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
Sched<[sched]>;
def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
(ins x86memop_f:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
+ (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
Sched<[sched.Folded]>;
}// Predicates = [HasAVX, NoVLX]
}
@@ -7181,38 +7180,38 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, VEX_4V, VEX_L,
+ (i8 timm:$src3))))]>, VEX_4V, VEX_L,
Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
- (i8 imm:$src3)))]>, VEX_4V, VEX_L,
+ (i8 timm:$src3)))]>, VEX_4V, VEX_L,
Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}
// Immediate transform to help with commuting.
-def Perm2XCommuteImm : SDNodeXForm<imm, [{
+def Perm2XCommuteImm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;
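
Editor's note: Perm2XCommuteImm covers vperm2f128/vperm2i128, whose immediate carries two 2-bit selectors (bits 1:0 for the result's low 128 bits, bits 5:4 for the high 128 bits); values 0/1 pick a half of the first source and 2/3 a half of the second. Swapping the sources therefore toggles bit 1 of each selector, which is the `^ 0x22` above. A standalone C++ sketch with illustrative names:

#include <cassert>
#include <cstdint>

// Flip the "which source" bit of both 128-bit-half selectors in a
// vperm2x128 immediate, matching Perm2XCommuteImm's XOR with 0x22.
static uint8_t commutePerm2x128Imm(uint8_t Imm) {
  return uint8_t(Imm ^ 0x22);
}

int main() {
  // 0x20 puts src1's low half in the result's low half and src2's low half in
  // the result's high half; with the sources swapped the same shuffle is 0x02.
  assert(commutePerm2x128Imm(0x20) == 0x02);
  assert(commutePerm2x128Imm(0x31) == 0x13);
  return 0;
}
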
let Predicates = [HasAVX] in {
// Pattern with load in other operand.
def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
- VR256:$src1, (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
+ VR256:$src1, (i8 timm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
- (loadv4i64 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+ (loadv4i64 addr:$src2), (i8 timm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
// Pattern with load in other operand.
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
- VR256:$src1, (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
+ VR256:$src1, (i8 timm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
}
//===----------------------------------------------------------------------===//
@@ -7257,7 +7256,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
+ [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>,
TAPD, VEX, Sched<[RR]>;
let hasSideEffects = 0, mayStore = 1 in
def mr : Ii8<0x1D, MRMDestMem, (outs),
@@ -7282,15 +7281,15 @@ let Predicates = [HasF16C, NoVLX] in {
(VCVTPH2PSrm addr:$src)>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
def : Pat<(store (i64 (extractelt
- (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
- def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
- (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
}
// Patterns for matching conversions from float to half-float and vice versa.
@@ -7327,20 +7326,20 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins RC:$src1, RC:$src2, u8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
Sched<[sched]>, VEX_4V;
def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
+ (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
// Pattern to commute if load is in first source.
- def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
+ def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
(!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
- (commuteXForm imm:$src3))>;
+ (commuteXForm timm:$src3))>;
}
let Predicates = [HasAVX2] in {
@@ -7351,19 +7350,19 @@ defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
SchedWriteBlend.YMM, VR256, i256mem,
BlendCommuteImm8>, VEX_L;
-def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
- (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
- (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
- (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
+ (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
- (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
-def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+ (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
}
// For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -7407,7 +7406,7 @@ def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0
// destination operand
//
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
- X86MemOperand x86memop, PatFrag ld_frag,
+ X86MemOperand x86memop, PatFrag bcast_frag,
ValueType OpVT128, ValueType OpVT256, Predicate prd> {
let Predicates = [HasAVX2, prd] in {
def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -7418,7 +7417,7 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
+ (OpVT128 (bcast_frag addr:$src)))]>,
Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -7428,7 +7427,7 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
+ (OpVT256 (bcast_frag addr:$src)))]>,
Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
// Provide aliases for broadcast from the same register class that
@@ -7439,13 +7438,13 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
}
}
-defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
+defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
v16i8, v32i8, NoVLX_Or_NoBWI>;
-defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
+defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
v8i16, v16i16, NoVLX_Or_NoBWI>;
-defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
+defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
v4i32, v8i32, NoVLX>;
-defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
+defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
v2i64, v4i64, NoVLX>;
let Predicates = [HasAVX2, NoVLX] in {
@@ -7455,14 +7454,11 @@ let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQYrm addr:$src)>;
- def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ // FIXME this is to handle aligned extloads from i8/i16.
+ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
(VPBROADCASTDrm addr:$src)>;
- def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
(VPBROADCASTDYrm addr:$src)>;
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (VPBROADCASTQrm addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (VPBROADCASTQYrm addr:$src)>;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
@@ -7483,17 +7479,12 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
-}
-let Predicates = [HasAVX2, NoVLX] in {
- // Provide aliases for broadcast from the same register class that
- // automatically does the extract.
- def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
- (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
- sub_xmm)))>;
- def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
- (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
- sub_xmm)))>;
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWYrm addr:$src)>;
}
let Predicates = [HasAVX2, NoVLX] in {
@@ -7509,45 +7500,41 @@ let Predicates = [HasAVX2, NoVLX] in {
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
+ (VPBROADCASTBrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit)),
- VR128)))>;
+ GR8:$src, sub_8bit))))>;
def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
+ (VPBROADCASTBYrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit)),
- VR128)))>;
+ GR8:$src, sub_8bit))))>;
def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
+ (VPBROADCASTWrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR16:$src, sub_16bit)),
- VR128)))>;
+ GR16:$src, sub_16bit))))>;
def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
+ (VPBROADCASTWYrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR16:$src, sub_16bit)),
- VR128)))>;
+ GR16:$src, sub_16bit))))>;
}
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
+ (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
- (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
+ (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
+ (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
+ (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
}
// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
-def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
(VBROADCASTSSYrm addr:$src)>;
-def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
(VBROADCASTSDYrm addr:$src)>;
-def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
(VBROADCASTSSrm addr:$src)>;
}
@@ -7557,12 +7544,12 @@ let Predicates = [HasAVX, NoVLX] in {
// 128bit broadcasts:
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
- def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
(VMOVDDUPrm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
(VMOVDDUPrr VR128:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+ def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
(VMOVDDUPrm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPrm addr:$src)>;
@@ -7581,19 +7568,19 @@ let Predicates = [HasAVX1Only] in {
(v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
+ (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
(VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
- (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
+ (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
+ (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
(VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
- (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
- (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;
+ (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
+ (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
def : Pat<(v2i64 (X86VBroadcast i64:$src)),
- (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
- def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
+ def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
(VMOVDDUPrm addr:$src)>;
}
@@ -7636,7 +7623,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+ (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
Sched<[Sched]>, VEX, VEX_L;
def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins memOp:$src1, u8imm:$src2),
@@ -7644,7 +7631,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(OpVT (X86VPermi (mem_frag addr:$src1),
- (i8 imm:$src2))))]>,
+ (i8 timm:$src2))))]>,
Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
}
}
@@ -7663,19 +7650,19 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
+ (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
- (i8 imm:$src3)))]>,
+ (i8 timm:$src3)))]>,
Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
let Predicates = [HasAVX2] in
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
- VR256:$src1, (i8 imm:$imm))),
- (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
+ VR256:$src1, (i8 timm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
//===----------------------------------------------------------------------===//
@@ -7760,7 +7747,7 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
int_x86_avx2_maskstore_q_256>, VEX_W;
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
- ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
+ ValueType MaskVT> {
// masked store
def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
(!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
@@ -7772,23 +7759,23 @@ multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
- defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
- defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
- defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
- defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}
let Predicates = [HasAVX1Only] in {
// load/store i32/i64 not supported use ps/pd version
- defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
- defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
- defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
- defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
}
let Predicates = [HasAVX2] in {
- defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
- defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
- defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
- defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+ defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
+ defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
+ defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
+ defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
}
//===----------------------------------------------------------------------===//
@@ -7956,13 +7943,13 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), "",
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
[(set RC:$dst, (OpVT (OpNode RC:$src1,
(MemOpFrag addr:$src2),
- imm:$src3)))], SSEPackedInt>,
+ timm:$src3)))], SSEPackedInt>,
Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
}
}
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 7050e1917494..7f41feb6c0d9 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -43,7 +43,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>;
let SchedRW = [WriteSystem] in {
def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
- [(int_x86_int imm:$trap)]>;
+ [(int_x86_int timm:$trap)]>;
def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB;
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index fc0da845299f..3a1212342a13 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -45,7 +45,7 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins),
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
- [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+ [(int_x86_xabort timm:$imm)]>, Requires<[HasRTM]>;
} // SchedRW
// HLE prefixes
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 66ca78556b82..229af366d940 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -143,13 +143,13 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>,
+ (vt128 (OpNode (vt128 VR128:$src1), timm:$src2)))]>,
XOP, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode (vt128 (load addr:$src1)), imm:$src2)))]>,
+ (vt128 (OpNode (vt128 (load addr:$src1)), timm:$src2)))]>,
XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -251,7 +251,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
"\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
- imm:$cc)))]>,
+ timm:$cc)))]>,
XOP_4V, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$cc),
@@ -260,14 +260,14 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (load addr:$src2)),
- imm:$cc)))]>,
+ timm:$cc)))]>,
XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
def : Pat<(OpNode (load addr:$src2),
- (vt128 VR128:$src1), imm:$cc),
+ (vt128 VR128:$src1), timm:$cc),
(!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2,
- (CommuteVPCOMCC imm:$cc))>;
+ (CommuteVPCOMCC timm:$cc))>;
}
defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>;
@@ -418,27 +418,27 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag,
X86FoldableSchedWrite sched> {
def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
+ (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
- (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>,
+ (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 timm:$src4))))]>,
Sched<[sched]>;
def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4),
+ (ins RC:$src1, RC:$src2, intmemop:$src3, u4imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3),
- (i8 imm:$src4))))]>, VEX_W,
+ (i8 timm:$src4))))]>, VEX_W,
Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4),
+ (ins RC:$src1, fpmemop:$src2, RC:$src3, u4imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2),
- RC:$src3, (i8 imm:$src4))))]>,
+ RC:$src3, (i8 timm:$src4))))]>,
Sched<[sched.Folded, sched.ReadAfterFold,
// fpmemop:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -447,7 +447,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
+ (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[]>, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>;
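
The pattern changes in this file swap imm for timm, i.e. ISD::Constant for ISD::TargetConstant, so the byte stays a true immediate operand of the selected instruction. From the user's side these are XOP intrinsics whose last argument must be a compile-time constant; a minimal sketch, assuming clang's XOP intrinsic header (the intrinsic name is illustrative, not part of this patch):

    #include <x86intrin.h>

    // Requires an XOP-capable target (-mxop). The rotate count is encoded as
    // an immediate, which is what the timm patterns above now express.
    __m128i rotate_each_lane_by_5(__m128i v) {
      return _mm_roti_epi32(v, 5);
    }
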
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 892a083f4d1a..01620b7b64c9 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -60,7 +60,7 @@ public:
X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI,
const X86RegisterBankInfo &RBI);
- bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ bool select(MachineInstr &I) override;
static const char *getName() { return DEBUG_TYPE; }
private:
@@ -94,11 +94,9 @@ private:
MachineFunction &MF) const;
bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF,
- CodeGenCoverage &CoverageInfo) const;
+ MachineFunction &MF);
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF,
- CodeGenCoverage &CoverageInfo) const;
+ MachineFunction &MF);
bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -217,7 +215,7 @@ static unsigned getSubRegIndex(const TargetRegisterClass *RC) {
}
static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Register::isPhysicalRegister(Reg));
if (X86::GR64RegClass.contains(Reg))
return &X86::GR64RegClass;
if (X86::GR32RegClass.contains(Reg))
@@ -233,15 +231,15 @@ static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) {
// Set X86 Opcode and constrain DestReg.
bool X86InstructionSelector::selectCopy(MachineInstr &I,
MachineRegisterInfo &MRI) const {
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
- unsigned SrcReg = I.getOperand(1).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+ if (Register::isPhysicalRegister(DstReg)) {
assert(I.isCopy() && "Generic operators do not allow physical registers");
if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID &&
@@ -253,7 +251,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
if (SrcRC != DstRC) {
// This case can be generated by ABI lowering; perform an anyext
- unsigned ExtSrc = MRI.createVirtualRegister(DstRC);
+ Register ExtSrc = MRI.createVirtualRegister(DstRC);
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::SUBREG_TO_REG))
.addDef(ExtSrc)
@@ -268,12 +266,12 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
return true;
}
- assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) &&
+ assert((!Register::isPhysicalRegister(SrcReg) || I.isCopy()) &&
"No phys reg on generic operators");
assert((DstSize == SrcSize ||
// Copies are a mean to setup initial types, the number of
// bits may not exactly match.
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ (Register::isPhysicalRegister(SrcReg) &&
DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) &&
"Copy with different width?!");
@@ -282,7 +280,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
if (SrcRegBank.getID() == X86::GPRRegBankID &&
DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize &&
- TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ Register::isPhysicalRegister(SrcReg)) {
// Change the physical register to perform the truncate.
const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg);
@@ -308,8 +306,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
return true;
}
-bool X86InstructionSelector::select(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+bool X86InstructionSelector::select(MachineInstr &I) {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -333,7 +330,7 @@ bool X86InstructionSelector::select(MachineInstr &I,
assert(I.getNumOperands() == I.getNumExplicitOperands() &&
"Generic instruction has unexpected implicit operands\n");
- if (selectImpl(I, CoverageInfo))
+ if (selectImpl(I, *CoverageInfo))
return true;
LLVM_DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
@@ -370,10 +367,10 @@ bool X86InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_UADDE:
return selectUadde(I, MRI, MF);
case TargetOpcode::G_UNMERGE_VALUES:
- return selectUnmergeValues(I, MRI, MF, CoverageInfo);
+ return selectUnmergeValues(I, MRI, MF);
case TargetOpcode::G_MERGE_VALUES:
case TargetOpcode::G_CONCAT_VECTORS:
- return selectMergeValues(I, MRI, MF, CoverageInfo);
+ return selectMergeValues(I, MRI, MF);
case TargetOpcode::G_EXTRACT:
return selectExtract(I, MRI, MF);
case TargetOpcode::G_INSERT:
@@ -512,7 +509,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
assert((Opc == TargetOpcode::G_STORE || Opc == TargetOpcode::G_LOAD) &&
"unexpected instruction");
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
@@ -572,7 +569,7 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I,
assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_GEP) &&
"unexpected instruction");
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
// Use LEA to calculate frame index and GEP
@@ -625,7 +622,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I,
AM.Base.Reg = X86::RIP;
}
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
unsigned NewOpc = getLeaOP(Ty, STI);
@@ -644,7 +641,7 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I,
assert((I.getOpcode() == TargetOpcode::G_CONSTANT) &&
"unexpected instruction");
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
if (RBI.getRegBank(DefReg, MRI, TRI)->getID() != X86::GPRRegBankID)
@@ -717,8 +714,8 @@ bool X86InstructionSelector::selectTruncOrPtrToInt(MachineInstr &I,
I.getOpcode() == TargetOpcode::G_PTRTOINT) &&
"unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
@@ -781,8 +778,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_ZEXT) && "unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
@@ -892,8 +889,8 @@ bool X86InstructionSelector::selectAnyext(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_ANYEXT) && "unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
@@ -952,8 +949,8 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I,
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(
(CmpInst::Predicate)I.getOperand(1).getPredicate());
- unsigned LHS = I.getOperand(2).getReg();
- unsigned RHS = I.getOperand(3).getReg();
+ Register LHS = I.getOperand(2).getReg();
+ Register RHS = I.getOperand(3).getReg();
if (SwapArgs)
std::swap(LHS, RHS);
@@ -998,8 +995,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_FCMP) && "unexpected instruction");
- unsigned LhsReg = I.getOperand(2).getReg();
- unsigned RhsReg = I.getOperand(3).getReg();
+ Register LhsReg = I.getOperand(2).getReg();
+ Register RhsReg = I.getOperand(3).getReg();
CmpInst::Predicate Predicate =
(CmpInst::Predicate)I.getOperand(1).getPredicate();
@@ -1033,7 +1030,7 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
break;
}
- unsigned ResultReg = I.getOperand(0).getReg();
+ Register ResultReg = I.getOperand(0).getReg();
RBI.constrainGenericRegister(
ResultReg,
*getRegClass(LLT::scalar(8), *RBI.getRegBank(ResultReg, MRI, TRI)), MRI);
@@ -1043,8 +1040,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
.addReg(LhsReg)
.addReg(RhsReg);
- unsigned FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass);
- unsigned FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass);
+ Register FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass);
+ Register FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass);
MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]);
MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -1089,11 +1086,11 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_UADDE) && "unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned CarryOutReg = I.getOperand(1).getReg();
- const unsigned Op0Reg = I.getOperand(2).getReg();
- const unsigned Op1Reg = I.getOperand(3).getReg();
- unsigned CarryInReg = I.getOperand(4).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register CarryOutReg = I.getOperand(1).getReg();
+ const Register Op0Reg = I.getOperand(2).getReg();
+ const Register Op1Reg = I.getOperand(3).getReg();
+ Register CarryInReg = I.getOperand(4).getReg();
const LLT DstTy = MRI.getType(DstReg);
@@ -1149,8 +1146,8 @@ bool X86InstructionSelector::selectExtract(MachineInstr &I,
assert((I.getOpcode() == TargetOpcode::G_EXTRACT) &&
"unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
int64_t Index = I.getOperand(2).getImm();
const LLT DstTy = MRI.getType(DstReg);
@@ -1281,9 +1278,9 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_INSERT) && "unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
- const unsigned InsertReg = I.getOperand(2).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
+ const Register InsertReg = I.getOperand(2).getReg();
int64_t Index = I.getOperand(3).getImm();
const LLT DstTy = MRI.getType(DstReg);
@@ -1335,14 +1332,13 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I,
}
bool X86InstructionSelector::selectUnmergeValues(
- MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF,
- CodeGenCoverage &CoverageInfo) const {
+ MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) {
assert((I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES) &&
"unexpected instruction");
// Split to extracts.
unsigned NumDefs = I.getNumOperands() - 1;
- unsigned SrcReg = I.getOperand(NumDefs).getReg();
+ Register SrcReg = I.getOperand(NumDefs).getReg();
unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
for (unsigned Idx = 0; Idx < NumDefs; ++Idx) {
@@ -1352,7 +1348,7 @@ bool X86InstructionSelector::selectUnmergeValues(
.addReg(SrcReg)
.addImm(Idx * DefSize);
- if (!select(ExtrInst, CoverageInfo))
+ if (!select(ExtrInst))
return false;
}
@@ -1361,15 +1357,14 @@ bool X86InstructionSelector::selectUnmergeValues(
}
bool X86InstructionSelector::selectMergeValues(
- MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF,
- CodeGenCoverage &CoverageInfo) const {
+ MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) {
assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES ||
I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS) &&
"unexpected instruction");
// Split to inserts.
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned SrcReg0 = I.getOperand(1).getReg();
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg0 = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg0);
@@ -1378,13 +1373,13 @@ bool X86InstructionSelector::selectMergeValues(
const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
// For the first src use insertSubReg.
- unsigned DefReg = MRI.createGenericVirtualRegister(DstTy);
+ Register DefReg = MRI.createGenericVirtualRegister(DstTy);
MRI.setRegBank(DefReg, RegBank);
if (!emitInsertSubreg(DefReg, I.getOperand(1).getReg(), I, MRI, MF))
return false;
for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) {
- unsigned Tmp = MRI.createGenericVirtualRegister(DstTy);
+ Register Tmp = MRI.createGenericVirtualRegister(DstTy);
MRI.setRegBank(Tmp, RegBank);
MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -1395,7 +1390,7 @@ bool X86InstructionSelector::selectMergeValues(
DefReg = Tmp;
- if (!select(InsertInst, CoverageInfo))
+ if (!select(InsertInst))
return false;
}
@@ -1403,7 +1398,7 @@ bool X86InstructionSelector::selectMergeValues(
TII.get(TargetOpcode::COPY), DstReg)
.addReg(DefReg);
- if (!select(CopyInst, CoverageInfo))
+ if (!select(CopyInst))
return false;
I.eraseFromParent();
@@ -1415,7 +1410,7 @@ bool X86InstructionSelector::selectCondBranch(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_BRCOND) && "unexpected instruction");
- const unsigned CondReg = I.getOperand(0).getReg();
+ const Register CondReg = I.getOperand(0).getReg();
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
MachineInstr &TestInst =
@@ -1442,7 +1437,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I,
if (CM != CodeModel::Small && CM != CodeModel::Large)
return false;
- const unsigned DstReg = I.getOperand(0).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
const LLT DstTy = MRI.getType(DstReg);
const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
unsigned Align = DstTy.getSizeInBits();
@@ -1460,7 +1455,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I,
// Under X86-64 non-small code model, GV (and friends) are 64-bits, so
// they cannot be folded into immediate fields.
- unsigned AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass);
+ Register AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass);
BuildMI(*I.getParent(), I, DbgLoc, TII.get(X86::MOV64ri), AddrReg)
.addConstantPoolIndex(CPI, 0, OpFlag);
@@ -1503,7 +1498,7 @@ bool X86InstructionSelector::selectImplicitDefOrPHI(
I.getOpcode() == TargetOpcode::G_PHI) &&
"unexpected instruction");
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
if (!MRI.getRegClassOrNull(DstReg)) {
const LLT DstTy = MRI.getType(DstReg);
@@ -1537,7 +1532,7 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
I.getOpcode() == TargetOpcode::G_LSHR) &&
"unexpected instruction");
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
const LLT DstTy = MRI.getType(DstReg);
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
@@ -1578,8 +1573,8 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
return false;
}
- unsigned Op0Reg = I.getOperand(1).getReg();
- unsigned Op1Reg = I.getOperand(2).getReg();
+ Register Op0Reg = I.getOperand(1).getReg();
+ Register Op1Reg = I.getOperand(2).getReg();
assert(MRI.getType(Op1Reg).getSizeInBits() == 8);
@@ -1606,9 +1601,9 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
I.getOpcode() == TargetOpcode::G_UREM) &&
"unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned Op1Reg = I.getOperand(1).getReg();
- const unsigned Op2Reg = I.getOperand(2).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register Op1Reg = I.getOperand(1).getReg();
+ const Register Op2Reg = I.getOperand(2).getReg();
const LLT RegTy = MRI.getType(DstReg);
assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) &&
@@ -1732,7 +1727,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(OpEntry.OpSignExtend));
else {
- unsigned Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass);
+ Register Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::MOV32r0),
Zero32);
@@ -1770,8 +1765,8 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
if ((I.getOpcode() == Instruction::SRem ||
I.getOpcode() == Instruction::URem) &&
OpEntry.DivRemResultReg == X86::AH && STI.is64Bit()) {
- unsigned SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
- unsigned ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ Register SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ Register ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Copy), SourceSuperReg)
.addReg(X86::AX);
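
The changes in this file are mostly the migration from raw unsigned register ids to llvm::Register, plus dropping the CodeGenCoverage parameter from select(). Register converts implicitly in both directions, so getReg() and createVirtualRegister() call sites only change their declared type. A minimal sketch of the wrapper's behaviour, using only operations that already appear in the diff:

    #include "llvm/CodeGen/Register.h"

    // Sketch only: construct from a raw id, convert back implicitly, and use
    // the static check that the selector code above relies on.
    bool isPhys(unsigned RawReg) {
      llvm::Register R(RawReg);
      unsigned Back = R; // implicit conversion back to unsigned
      (void)Back;
      return llvm::Register::isPhysicalRegister(R);
    }
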
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 40141d894629..1d7adbaa9e99 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -23,7 +23,7 @@ enum IntrinsicType : uint16_t {
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
INTR_TYPE_3OP_IMM8,
- CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV,
+ CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI,
CVTPD2PS_MASK,
INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE,
INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE,
@@ -1101,8 +1101,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB),
X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
- X86_INTRINSIC_DATA(tbm_bextri_u32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
- X86_INTRINSIC_DATA(tbm_bextri_u64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
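
tbm_bextri_u32/u64 move to their own BEXTRI category because their control word has to survive as an immediate: its low byte is the start bit and its second byte is the field length. A user-level sketch, assuming clang's TBM intrinsic from tbmintrin.h (compile with -mtbm); the helper name is illustrative:

    #include <x86intrin.h>

    // Extract an 8-bit field starting at bit 4 of x.
    // Control word layout: bits [7:0] = start bit, bits [15:8] = length.
    unsigned extract_field(unsigned x) {
      return __bextri_u32(x, (8u << 8) | 4u); // must be a compile-time constant
    }
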
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 00fb1b573858..04121f863c89 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -13,6 +13,7 @@
#include "X86LegalizerInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -84,6 +85,24 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
verify(*STI.getInstrInfo());
}
+bool X86LegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ if (createMemLibcall(MIRBuilder, MRI, MI) ==
+ LegalizerHelper::UnableToLegalize)
+ return false;
+ MI.eraseFromParent();
+ return true;
+ default:
+ break;
+ }
+ return true;
+}
+
void X86LegalizerInfo::setLegalizerInfo32bit() {
const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
@@ -158,6 +177,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_ANYEXT, Ty}, Legal);
}
setAction({G_ANYEXT, s128}, Legal);
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
// Comparison
setAction({G_ICMP, s1}, Legal);
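
The new legalizeIntrinsic hook hands the mem intrinsics to createMemLibcall, so GlobalISel emits an ordinary libcall instead of giving up on the G_INTRINSIC. At the source level this covers code as plain as the following sketch (not tied to any particular test in the patch):

    #include <cstring>

    // Clang typically turns this into the llvm.memcpy intrinsic; with the hook
    // above, GlobalISel lowers the resulting generic intrinsic to a call to
    // the memcpy library function.
    void copy_bytes(void *dst, const void *src, std::size_t n) {
      std::memcpy(dst, src, n);
    }
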
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
index d21707b9ab9b..7a0f13fb5ae6 100644
--- a/lib/Target/X86/X86LegalizerInfo.h
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -32,6 +32,9 @@ private:
public:
X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
+ bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
private:
void setLegalizerInfo32bit();
void setLegalizerInfo64bit();
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index b1fefaa84be4..78098fd6262f 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -427,6 +427,41 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
}
}
+// Replace TAILJMP opcodes with their equivalent opcodes that have encoding
+// information.
+static unsigned convertTailJumpOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::TAILJMPr:
+ Opcode = X86::JMP32r;
+ break;
+ case X86::TAILJMPm:
+ Opcode = X86::JMP32m;
+ break;
+ case X86::TAILJMPr64:
+ Opcode = X86::JMP64r;
+ break;
+ case X86::TAILJMPm64:
+ Opcode = X86::JMP64m;
+ break;
+ case X86::TAILJMPr64_REX:
+ Opcode = X86::JMP64r_REX;
+ break;
+ case X86::TAILJMPm64_REX:
+ Opcode = X86::JMP64m_REX;
+ break;
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64:
+ Opcode = X86::JMP_1;
+ break;
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC:
+ Opcode = X86::JCC_1;
+ break;
+ }
+
+ return Opcode;
+}
+
void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
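
convertTailJumpOpcode centralizes the TAILJMP*-pseudo to real-jump mapping; the cases later in Lower() and LowerPATCHABLE_TAIL_CALL now share it instead of open-coding the opcode switch. The code that exercises it is an ordinary sibling call; a sketch:

    int callee(int);

    // At -O2 this tail call is typically selected as a TAILJMPd64 pseudo and
    // is emitted as a plain "jmp callee" once the pseudo is rewritten here.
    int caller(int x) { return callee(x + 1); }
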
@@ -500,21 +535,190 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
break;
}
- // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
- // inputs modeled as normal uses instead of implicit uses. As such, truncate
- // off all but the first operand (the callee). FIXME: Change isel.
- case X86::TAILJMPr64:
- case X86::TAILJMPr64_REX:
- case X86::CALL64r:
- case X86::CALL64pcrel32: {
- unsigned Opcode = OutMI.getOpcode();
- MCOperand Saved = OutMI.getOperand(0);
- OutMI = MCInst();
- OutMI.setOpcode(Opcode);
- OutMI.addOperand(Saved);
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rmik:
+ case X86::VPCMPBZ128rri: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rmik:
+ case X86::VPCMPBZ256rri: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrmik:
+ case X86::VPCMPBZrri: case X86::VPCMPBZrrik:
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rmik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ128rri: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rmik:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZ256rri: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrmik:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ case X86::VPCMPDZrri: case X86::VPCMPDZrrik:
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rmik:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ128rri: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rmik:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZ256rri: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrmik:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ case X86::VPCMPQZrri: case X86::VPCMPQZrrik:
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rmik:
+ case X86::VPCMPWZ128rri: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rmik:
+ case X86::VPCMPWZ256rri: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrmik:
+ case X86::VPCMPWZrri: case X86::VPCMPWZrrik: {
+ // Turn immediate 0 into the VPCMPEQ instruction.
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break;
+ case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break;
+ case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break;
+ case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPEQBZ128rrk; break;
+ case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPEQBZ256rm; break;
+ case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPEQBZ256rmk; break;
+ case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPEQBZ256rr; break;
+ case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPEQBZ256rrk; break;
+ case X86::VPCMPBZrmi: NewOpc = X86::VPCMPEQBZrm; break;
+ case X86::VPCMPBZrmik: NewOpc = X86::VPCMPEQBZrmk; break;
+ case X86::VPCMPBZrri: NewOpc = X86::VPCMPEQBZrr; break;
+ case X86::VPCMPBZrrik: NewOpc = X86::VPCMPEQBZrrk; break;
+ case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPEQDZ128rm; break;
+ case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPEQDZ128rmb; break;
+ case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPEQDZ128rmbk; break;
+ case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPEQDZ128rmk; break;
+ case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPEQDZ128rr; break;
+ case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPEQDZ128rrk; break;
+ case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPEQDZ256rm; break;
+ case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPEQDZ256rmb; break;
+ case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPEQDZ256rmbk; break;
+ case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPEQDZ256rmk; break;
+ case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPEQDZ256rr; break;
+ case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPEQDZ256rrk; break;
+ case X86::VPCMPDZrmi: NewOpc = X86::VPCMPEQDZrm; break;
+ case X86::VPCMPDZrmib: NewOpc = X86::VPCMPEQDZrmb; break;
+ case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPEQDZrmbk; break;
+ case X86::VPCMPDZrmik: NewOpc = X86::VPCMPEQDZrmk; break;
+ case X86::VPCMPDZrri: NewOpc = X86::VPCMPEQDZrr; break;
+ case X86::VPCMPDZrrik: NewOpc = X86::VPCMPEQDZrrk; break;
+ case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPEQQZ128rm; break;
+ case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPEQQZ128rmb; break;
+ case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPEQQZ128rmbk; break;
+ case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPEQQZ128rmk; break;
+ case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPEQQZ128rr; break;
+ case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPEQQZ128rrk; break;
+ case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPEQQZ256rm; break;
+ case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPEQQZ256rmb; break;
+ case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPEQQZ256rmbk; break;
+ case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPEQQZ256rmk; break;
+ case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPEQQZ256rr; break;
+ case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPEQQZ256rrk; break;
+ case X86::VPCMPQZrmi: NewOpc = X86::VPCMPEQQZrm; break;
+ case X86::VPCMPQZrmib: NewOpc = X86::VPCMPEQQZrmb; break;
+ case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPEQQZrmbk; break;
+ case X86::VPCMPQZrmik: NewOpc = X86::VPCMPEQQZrmk; break;
+ case X86::VPCMPQZrri: NewOpc = X86::VPCMPEQQZrr; break;
+ case X86::VPCMPQZrrik: NewOpc = X86::VPCMPEQQZrrk; break;
+ case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPEQWZ128rm; break;
+ case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPEQWZ128rmk; break;
+ case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPEQWZ128rr; break;
+ case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPEQWZ128rrk; break;
+ case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPEQWZ256rm; break;
+ case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPEQWZ256rmk; break;
+ case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPEQWZ256rr; break;
+ case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPEQWZ256rrk; break;
+ case X86::VPCMPWZrmi: NewOpc = X86::VPCMPEQWZrm; break;
+ case X86::VPCMPWZrmik: NewOpc = X86::VPCMPEQWZrmk; break;
+ case X86::VPCMPWZrri: NewOpc = X86::VPCMPEQWZrr; break;
+ case X86::VPCMPWZrrik: NewOpc = X86::VPCMPEQWZrrk; break;
+ }
+
+ OutMI.setOpcode(NewOpc);
+ OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1));
+ break;
+ }
+
+ // Turn immediate 6 into the VPCMPGT instruction.
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break;
+ case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break;
+ case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break;
+ case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPGTBZ128rrk; break;
+ case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPGTBZ256rm; break;
+ case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPGTBZ256rmk; break;
+ case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPGTBZ256rr; break;
+ case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPGTBZ256rrk; break;
+ case X86::VPCMPBZrmi: NewOpc = X86::VPCMPGTBZrm; break;
+ case X86::VPCMPBZrmik: NewOpc = X86::VPCMPGTBZrmk; break;
+ case X86::VPCMPBZrri: NewOpc = X86::VPCMPGTBZrr; break;
+ case X86::VPCMPBZrrik: NewOpc = X86::VPCMPGTBZrrk; break;
+ case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPGTDZ128rm; break;
+ case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPGTDZ128rmb; break;
+ case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPGTDZ128rmbk; break;
+ case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPGTDZ128rmk; break;
+ case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPGTDZ128rr; break;
+ case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPGTDZ128rrk; break;
+ case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPGTDZ256rm; break;
+ case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPGTDZ256rmb; break;
+ case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPGTDZ256rmbk; break;
+ case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPGTDZ256rmk; break;
+ case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPGTDZ256rr; break;
+ case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPGTDZ256rrk; break;
+ case X86::VPCMPDZrmi: NewOpc = X86::VPCMPGTDZrm; break;
+ case X86::VPCMPDZrmib: NewOpc = X86::VPCMPGTDZrmb; break;
+ case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPGTDZrmbk; break;
+ case X86::VPCMPDZrmik: NewOpc = X86::VPCMPGTDZrmk; break;
+ case X86::VPCMPDZrri: NewOpc = X86::VPCMPGTDZrr; break;
+ case X86::VPCMPDZrrik: NewOpc = X86::VPCMPGTDZrrk; break;
+ case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPGTQZ128rm; break;
+ case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPGTQZ128rmb; break;
+ case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPGTQZ128rmbk; break;
+ case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPGTQZ128rmk; break;
+ case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPGTQZ128rr; break;
+ case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPGTQZ128rrk; break;
+ case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPGTQZ256rm; break;
+ case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPGTQZ256rmb; break;
+ case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPGTQZ256rmbk; break;
+ case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPGTQZ256rmk; break;
+ case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPGTQZ256rr; break;
+ case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPGTQZ256rrk; break;
+ case X86::VPCMPQZrmi: NewOpc = X86::VPCMPGTQZrm; break;
+ case X86::VPCMPQZrmib: NewOpc = X86::VPCMPGTQZrmb; break;
+ case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPGTQZrmbk; break;
+ case X86::VPCMPQZrmik: NewOpc = X86::VPCMPGTQZrmk; break;
+ case X86::VPCMPQZrri: NewOpc = X86::VPCMPGTQZrr; break;
+ case X86::VPCMPQZrrik: NewOpc = X86::VPCMPGTQZrrk; break;
+ case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPGTWZ128rm; break;
+ case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPGTWZ128rmk; break;
+ case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPGTWZ128rr; break;
+ case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPGTWZ128rrk; break;
+ case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPGTWZ256rm; break;
+ case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPGTWZ256rmk; break;
+ case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPGTWZ256rr; break;
+ case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPGTWZ256rrk; break;
+ case X86::VPCMPWZrmi: NewOpc = X86::VPCMPGTWZrm; break;
+ case X86::VPCMPWZrmik: NewOpc = X86::VPCMPGTWZrmk; break;
+ case X86::VPCMPWZrri: NewOpc = X86::VPCMPGTWZrr; break;
+ case X86::VPCMPWZrrik: NewOpc = X86::VPCMPGTWZrrk; break;
+ }
+
+ OutMI.setOpcode(NewOpc);
+ OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1));
+ break;
+ }
+
break;
}
+ // CALL64r, CALL64pcrel32 - These instructions used to have
+ // register inputs modeled as normal uses instead of implicit uses. As such,
+ // we used to truncate off all but the first operand (the callee). This
+ // issue seems to have been fixed at some point. This assert verifies that.
+ case X86::CALL64r:
+ case X86::CALL64pcrel32:
+ assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
+ break;
+
case X86::EH_RETURN:
case X86::EH_RETURN64: {
OutMI = MCInst();
@@ -539,36 +743,30 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
break;
}
- // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump
- // instruction.
- {
- unsigned Opcode;
- case X86::TAILJMPr:
- Opcode = X86::JMP32r;
- goto SetTailJmpOpcode;
- case X86::TAILJMPd:
- case X86::TAILJMPd64:
- Opcode = X86::JMP_1;
- goto SetTailJmpOpcode;
-
- SetTailJmpOpcode:
- MCOperand Saved = OutMI.getOperand(0);
- OutMI = MCInst();
- OutMI.setOpcode(Opcode);
- OutMI.addOperand(Saved);
- break;
- }
+ // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump
+ // instruction.
+ case X86::TAILJMPr:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64:
+ assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
+ break;
case X86::TAILJMPd_CC:
- case X86::TAILJMPd64_CC: {
- MCOperand Saved = OutMI.getOperand(0);
- MCOperand Saved2 = OutMI.getOperand(1);
- OutMI = MCInst();
- OutMI.setOpcode(X86::JCC_1);
- OutMI.addOperand(Saved);
- OutMI.addOperand(Saved2);
+ case X86::TAILJMPd64_CC:
+ assert(OutMI.getNumOperands() == 2 && "Unexpected number of operands!");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
+ break;
+
+ case X86::TAILJMPm:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPm64_REX:
+ assert(OutMI.getNumOperands() == X86::AddrNumOperands &&
+ "Unexpected number of operands!");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
break;
- }
case X86::DEC16r:
case X86::DEC32r:
@@ -958,7 +1156,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
// FAULTING_LOAD_OP <def>, <faulting type>, <MBB handler>,
// <opcode>, <operands>
- unsigned DefRegister = FaultingMI.getOperand(0).getReg();
+ Register DefRegister = FaultingMI.getOperand(0).getReg();
FaultMaps::FaultKind FK =
static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
@@ -1079,7 +1277,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
// Emit MOV to materialize the target address and the CALL to target.
// This is encoded with 12-13 bytes, depending on which register is used.
- unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
+ Register ScratchReg = MI.getOperand(ScratchIdx).getReg();
if (X86II::isX86_64ExtendedReg(ScratchReg))
EncodedBytes = 13;
else
@@ -1369,6 +1567,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
recordSled(CurSled, MI, SledKind::TAIL_CALL);
unsigned OpCode = MI.getOperand(0).getImm();
+ OpCode = convertTailJumpOpcode(OpCode);
MCInst TC;
TC.setOpcode(OpCode);
@@ -1538,8 +1737,6 @@ static void printConstant(const Constant *COp, raw_ostream &CS) {
void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only");
- const X86RegisterInfo *RI =
- MF->getSubtarget<X86Subtarget>().getRegisterInfo();
// Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86.
if (EmitFPOData) {
@@ -1577,17 +1774,16 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
// Otherwise, use the .seh_ directives for all other Windows platforms.
switch (MI->getOpcode()) {
case X86::SEH_PushReg:
- OutStreamer->EmitWinCFIPushReg(
- RI->getSEHRegNum(MI->getOperand(0).getImm()));
+ OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm());
break;
case X86::SEH_SaveReg:
- OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
case X86::SEH_SaveXMM:
- OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
@@ -1596,9 +1792,8 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
break;
case X86::SEH_SetFrame:
- OutStreamer->EmitWinCFISetFrame(
- RI->getSEHRegNum(MI->getOperand(0).getImm()),
- MI->getOperand(1).getImm());
+ OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(),
+ MI->getOperand(1).getImm());
break;
case X86::SEH_PushFrame:
@@ -1650,7 +1845,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::EH_RETURN:
case X86::EH_RETURN64: {
// Lower these as normal, but add some comments.
- unsigned Reg = MI->getOperand(0).getReg();
+ Register Reg = MI->getOperand(0).getReg();
OutStreamer->AddComment(StringRef("eh_return, addr: %") +
X86ATTInstPrinter::getRegisterName(Reg));
break;
@@ -1697,11 +1892,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::MASKPAIR16LOAD: {
int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm();
assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
- const X86RegisterInfo *RI =
- MF->getSubtarget<X86Subtarget>().getRegisterInfo();
- unsigned Reg = MI->getOperand(0).getReg();
- unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
- unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+ Register Reg = MI->getOperand(0).getReg();
+ Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
// Load the first mask register
MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm);
@@ -1730,11 +1923,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::MASKPAIR16STORE: {
int64_t Disp = MI->getOperand(X86::AddrDisp).getImm();
assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
- const X86RegisterInfo *RI =
- MF->getSubtarget<X86Subtarget>().getRegisterInfo();
- unsigned Reg = MI->getOperand(X86::AddrNumOperands).getReg();
- unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
- unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+ Register Reg = MI->getOperand(X86::AddrNumOperands).getReg();
+ Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
// Store the first mask register
MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk);
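
The large VPCMP switches above rely on the AVX-512 integer compare immediate encoding the predicate: 0 selects EQ and 6 selects NLE (signed greater-than), which is why exactly those two values can be re-emitted as the shorter VPCMPEQ*/VPCMPGT* forms. A hypothetical helper, not part of the patch, just to spell out the encoding:

    // Predicate field of the VPCMP{B,W,D,Q} imm8 (signed comparisons).
    static const char *vpcmpPredicateName(unsigned Imm) {
      switch (Imm & 0x7) {
      case 0: return "eq";
      case 1: return "lt";
      case 2: return "le";
      case 3: return "false";
      case 4: return "neq";
      case 5: return "nlt"; // greater-than-or-equal
      case 6: return "nle"; // greater-than, the VPCMPGT case above
      default: return "true"; // 7
      }
    }
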
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index d7e535598d81..5cb80a082b56 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -36,6 +36,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// is stashed.
signed char RestoreBasePointerOffset = 0;
+ /// WinEHXMMSlotInfo - Slot information of XMM registers in the stack frame
+ /// in bytes.
+ DenseMap<int, unsigned> WinEHXMMSlotInfo;
+
/// CalleeSavedFrameSize - Size of the callee-saved register portion of the
/// stack frame in bytes.
unsigned CalleeSavedFrameSize = 0;
@@ -120,6 +124,10 @@ public:
void setRestoreBasePointer(const MachineFunction *MF);
int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
+ DenseMap<int, unsigned>& getWinEHXMMSlotInfo() { return WinEHXMMSlotInfo; }
+ const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const {
+ return WinEHXMMSlotInfo; }
+
unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
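
WinEHXMMSlotInfo is a plain DenseMap from the frame index of an XMM spill slot to its offset in bytes, so frame lowering can record where the register was saved and the Win64 EH code can find it again. A hypothetical use of the accessors (X86FI, FrameIndex and Offset are illustrative names, not from the patch):

    // Recording a slot, e.g. from frame lowering:
    X86FI->getWinEHXMMSlotInfo()[FrameIndex] = Offset; // offset in bytes

    // Looking it up later, e.g. when resolving a frame index in a funclet:
    const llvm::DenseMap<int, unsigned> &Slots = X86FI->getWinEHXMMSlotInfo();
    auto It = Slots.find(FrameIndex);
    unsigned XMMOffset = (It != Slots.end()) ? It->second : 0;
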
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index 7f75598b0655..1aee01563c4b 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -198,8 +198,7 @@ static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
static inline bool isIdenticalOp(const MachineOperand &MO1,
const MachineOperand &MO2) {
return MO1.isIdenticalTo(MO2) &&
- (!MO1.isReg() ||
- !TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+ (!MO1.isReg() || !Register::isPhysicalRegister(MO1.getReg()));
}
#ifndef NDEBUG
@@ -235,9 +234,9 @@ static inline bool isLEA(const MachineInstr &MI) {
namespace {
-class OptimizeLEAPass : public MachineFunctionPass {
+class X86OptimizeLEAPass : public MachineFunctionPass {
public:
- OptimizeLEAPass() : MachineFunctionPass(ID) {}
+ X86OptimizeLEAPass() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "X86 LEA Optimize"; }
@@ -246,6 +245,8 @@ public:
/// been calculated by LEA. Also, remove redundant LEAs.
bool runOnMachineFunction(MachineFunction &MF) override;
+ static char ID;
+
private:
using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
@@ -296,18 +297,18 @@ private:
MachineRegisterInfo *MRI;
const X86InstrInfo *TII;
const X86RegisterInfo *TRI;
-
- static char ID;
};
} // end anonymous namespace
-char OptimizeLEAPass::ID = 0;
+char X86OptimizeLEAPass::ID = 0;
-FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }
+FunctionPass *llvm::createX86OptimizeLEAs() { return new X86OptimizeLEAPass(); }
+INITIALIZE_PASS(X86OptimizeLEAPass, DEBUG_TYPE, "X86 optimize LEA pass", false,
+ false)
-int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
- const MachineInstr &Last) {
+int X86OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
+ const MachineInstr &Last) {
// Both instructions must be in the same basic block and they must be
// presented in InstrPos.
assert(Last.getParent() == First.getParent() &&
@@ -328,10 +329,9 @@ int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
// 3) Displacement of the new memory operand should fit in 1 byte if possible.
// 4) The LEA should be as close to MI as possible, and prior to it if
// possible.
-bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
- const MachineInstr &MI,
- MachineInstr *&BestLEA,
- int64_t &AddrDispShift, int &Dist) {
+bool X86OptimizeLEAPass::chooseBestLEA(
+ const SmallVectorImpl<MachineInstr *> &List, const MachineInstr &MI,
+ MachineInstr *&BestLEA, int64_t &AddrDispShift, int &Dist) {
const MachineFunction *MF = MI.getParent()->getParent();
const MCInstrDesc &Desc = MI.getDesc();
int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) +
@@ -387,9 +387,10 @@ bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
// Get the difference between the addresses' displacements of the two
// instructions \p MI1 and \p MI2. The numbers of the first memory operands are
// passed through \p N1 and \p N2.
-int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1,
- const MachineInstr &MI2,
- unsigned N2) const {
+int64_t X86OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1,
+ unsigned N1,
+ const MachineInstr &MI2,
+ unsigned N2) const {
const MachineOperand &Op1 = MI1.getOperand(N1 + X86::AddrDisp);
const MachineOperand &Op2 = MI2.getOperand(N2 + X86::AddrDisp);
@@ -411,9 +412,9 @@ int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1,
// 2) Def registers of LEAs belong to the same class.
// 3) All uses of the Last LEA def register are replaceable, thus the
// register is used only as address base.
-bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
- const MachineInstr &Last,
- int64_t &AddrDispShift) const {
+bool X86OptimizeLEAPass::isReplaceable(const MachineInstr &First,
+ const MachineInstr &Last,
+ int64_t &AddrDispShift) const {
assert(isLEA(First) && isLEA(Last) &&
"The function works only with LEA instructions");
@@ -467,7 +468,8 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
return true;
}
-void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) {
+void X86OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB,
+ MemOpMap &LEAs) {
unsigned Pos = 0;
for (auto &MI : MBB) {
// Assign the position number to the instruction. Note that we are going to
@@ -485,7 +487,7 @@ void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) {
// Try to find load and store instructions which recalculate addresses already
// calculated by some LEA and replace their memory operands with its def
// register.
-bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
+bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
bool Changed = false;
assert(!LEAs.empty());
@@ -564,9 +566,9 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
return Changed;
}
-MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
- unsigned VReg,
- int64_t AddrDispShift) {
+MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
+ unsigned VReg,
+ int64_t AddrDispShift) {
DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression());
if (AddrDispShift != 0)
Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift);
@@ -583,7 +585,7 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
}
// Try to find similar LEAs in the list and replace one with another.
-bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
+bool X86OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
bool Changed = false;
// Loop over all entries in the table.
@@ -613,8 +615,8 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
// Loop over all uses of the Last LEA and update their operands. Note
// that the correctness of this has already been checked in the
// isReplaceable function.
- unsigned FirstVReg = First.getOperand(0).getReg();
- unsigned LastVReg = Last.getOperand(0).getReg();
+ Register FirstVReg = First.getOperand(0).getReg();
+ Register LastVReg = Last.getOperand(0).getReg();
for (auto UI = MRI->use_begin(LastVReg), UE = MRI->use_end();
UI != UE;) {
MachineOperand &MO = *UI++;
@@ -670,7 +672,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
return Changed;
}
-bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
+bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
if (DisableX86LEAOpt || skipFunction(MF.getFunction()))
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
index 78fede3dcde2..daddf4231897 100644
--- a/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -46,7 +46,9 @@ const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass(
if (X86::GR8RegClass.hasSubClassEq(&RC) ||
X86::GR16RegClass.hasSubClassEq(&RC) ||
X86::GR32RegClass.hasSubClassEq(&RC) ||
- X86::GR64RegClass.hasSubClassEq(&RC))
+ X86::GR64RegClass.hasSubClassEq(&RC) ||
+ X86::LOW32_ADDR_ACCESSRegClass.hasSubClassEq(&RC) ||
+ X86::LOW32_ADDR_ACCESS_RBPRegClass.hasSubClassEq(&RC))
return getRegBank(X86::GPRRegBankID);
if (X86::FR32XRegClass.hasSubClassEq(&RC) ||
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 2e2f1f9e438a..ff625325b4c9 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -544,7 +544,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
"Stack realignment in presence of dynamic allocas is not supported with"
"this calling convention.");
- unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
+ Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
I.isValid(); ++I)
Reserved.set(*I);
@@ -677,13 +677,13 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
MI.getOperand(4).getImm() != 0 ||
MI.getOperand(5).getReg() != X86::NoRegister)
return false;
- unsigned BasePtr = MI.getOperand(1).getReg();
+ Register BasePtr = MI.getOperand(1).getReg();
// In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will
// be replaced with a 32-bit operand MOV which will zero extend the upper
// 32-bits of the super register.
if (Opc == X86::LEA64_32r)
BasePtr = getX86SubSuperRegister(BasePtr, 32);
- unsigned NewDestReg = MI.getOperand(0).getReg();
+ Register NewDestReg = MI.getOperand(0).getReg();
const X86InstrInfo *TII =
MI.getParent()->getParent()->getSubtarget<X86Subtarget>().getInstrInfo();
TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr,
@@ -692,12 +692,27 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
return true;
}
+static bool isFuncletReturnInstr(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::CATCHRET:
+ case X86::CLEANUPRET:
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("impossible");
+}
+
void
X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineInstr &MI = *II;
- MachineFunction &MF = *MI.getParent()->getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ bool IsEHFuncletEpilogue = MBBI == MBB.end() ? false
+ : isFuncletReturnInstr(*MBBI);
const X86FrameLowering *TFI = getFrameLowering(MF);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
@@ -709,6 +724,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
"Return instruction can only reference SP relative frame objects");
FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0);
+ } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) {
+ FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr);
} else {
FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr);
}
@@ -729,7 +746,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// register as source operand, semantic is the same and destination is
// 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided.
// Don't change BasePtr since it is used later for stack adjustment.
- unsigned MachineBasePtr = BasePtr;
+ Register MachineBasePtr = BasePtr;
if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
MachineBasePtr = getX86SubSuperRegister(BasePtr, 64);
@@ -773,7 +790,7 @@ Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
unsigned
X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
- unsigned FrameReg = getFrameRegister(MF);
+ Register FrameReg = getFrameRegister(MF);
if (Subtarget.isTarget64BitILP32())
FrameReg = getX86SubSuperRegister(FrameReg, 32);
return FrameReg;
@@ -782,7 +799,7 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
unsigned
X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
- unsigned StackReg = getStackRegister();
+ Register StackReg = getStackRegister();
if (Subtarget.isTarget64BitILP32())
StackReg = getX86SubSuperRegister(StackReg, 32);
return StackReg;
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index b435b22e8ac7..f8464c7e8298 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -58,8 +58,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineModuleInfo>();
- AU.addPreserved<MachineModuleInfo>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
}
private:
@@ -97,7 +97,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
TII = STI->getInstrInfo();
Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64;
- MMI = &getAnalysis<MachineModuleInfo>();
+ MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
Module &M = const_cast<Module &>(*MMI->getModule());
// If this function is not a thunk, check to see if we need to insert
@@ -279,7 +279,7 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
CallTarget->addLiveIn(Reg);
CallTarget->setHasAddressTaken();
- CallTarget->setAlignment(4);
+ CallTarget->setAlignment(Align(16));
insertRegReturnAddrClobber(*CallTarget, Reg);
CallTarget->back().setPreInstrSymbol(MF, TargetSym);
BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
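
The setAlignment change is not only a type change: the old MachineBasicBlock::setAlignment took the log2 of the alignment, while the new overload takes an llvm::Align in bytes, so the two calls below request the same 16-byte alignment. In sketch form:

    // Equivalent requests for 16-byte alignment of the call-target block:
    CallTarget->setAlignment(4);         // old API: log2(16) == 4
    CallTarget->setAlignment(Align(16)); // new API: Align is measured in bytes
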
diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td
index 7574e4b8f896..9b1fcaa8a13d 100755
--- a/lib/Target/X86/X86SchedBroadwell.td
+++ b/lib/Target/X86/X86SchedBroadwell.td
@@ -232,8 +232,12 @@ defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
-defm : X86WriteRes<WriteFMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+
defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 284d1567c5c6..06f417501b21 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -231,8 +231,12 @@ defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
-defm : X86WriteRes<WriteFMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+
defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td
index 41bd776648f7..76001d382a27 100644
--- a/lib/Target/X86/X86SchedPredicates.td
+++ b/lib/Target/X86/X86SchedPredicates.td
@@ -84,3 +84,60 @@ def IsSETAm_Or_SETBEm : CheckAny<[
CheckImmOperand_s<5, "X86::COND_A">,
CheckImmOperand_s<5, "X86::COND_BE">
]>;
+
+// A predicate used to check if an instruction has a LOCK prefix.
+def CheckLockPrefix : CheckFunctionPredicate<
+ "X86_MC::hasLockPrefix",
+ "X86InstrInfo::hasLockPrefix"
+>;
+
+def IsRegRegCompareAndSwap_8 : CheckOpcode<[ CMPXCHG8rr ]>;
+
+def IsRegMemCompareAndSwap_8 : CheckOpcode<[
+ LCMPXCHG8, CMPXCHG8rm
+]>;
+
+def IsRegRegCompareAndSwap_16_32_64 : CheckOpcode<[
+ CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr
+]>;
+
+def IsRegMemCompareAndSwap_16_32_64 : CheckOpcode<[
+ CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm,
+ LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
+ LCMPXCHG8B, LCMPXCHG16B
+]>;
+
+def IsCompareAndSwap8B : CheckOpcode<[ CMPXCHG8B, LCMPXCHG8B ]>;
+def IsCompareAndSwap16B : CheckOpcode<[ CMPXCHG16B, LCMPXCHG16B ]>;
+
+def IsRegMemCompareAndSwap : CheckOpcode<
+ !listconcat(
+ IsRegMemCompareAndSwap_8.ValidOpcodes,
+ IsRegMemCompareAndSwap_16_32_64.ValidOpcodes
+ )>;
+
+def IsRegRegCompareAndSwap : CheckOpcode<
+ !listconcat(
+ IsRegRegCompareAndSwap_8.ValidOpcodes,
+ IsRegRegCompareAndSwap_16_32_64.ValidOpcodes
+ )>;
+
+def IsAtomicCompareAndSwap_8 : CheckAll<[
+ CheckLockPrefix,
+ IsRegMemCompareAndSwap_8
+]>;
+
+def IsAtomicCompareAndSwap : CheckAll<[
+ CheckLockPrefix,
+ IsRegMemCompareAndSwap
+]>;
+
+def IsAtomicCompareAndSwap8B : CheckAll<[
+ CheckLockPrefix,
+ IsCompareAndSwap8B
+]>;
+
+def IsAtomicCompareAndSwap16B : CheckAll<[
+ CheckLockPrefix,
+ IsCompareAndSwap16B
+]>;
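CheckFunctionPredicate ties a scheduling predicate to two C++ helpers: the first name is used when the predicate is evaluated on MCInsts (e.g. by llvm-mca), the second when it is evaluated on MachineInstrs during scheduling. As a non-authoritative sketch of what a lock-prefix test like this typically reduces to (the TSFlags bit name X86II::LOCK and the include paths are assumptions, not taken from this patch):

    #include "MCTargetDesc/X86BaseInfo.h"
    #include "llvm/CodeGen/MachineInstr.h"

    namespace {
    // Sketch only: checks the LOCK bit that the instruction definitions set
    // on LOCK-prefixed forms such as LCMPXCHG32.
    bool hasLockPrefixSketch(const llvm::MachineInstr &MI) {
      return (MI.getDesc().TSFlags & llvm::X86II::LOCK) != 0;
    }
    } // namespace

The MCInst-side helper is expected to follow the same idea at the MC layer.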
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index d40bdf728a48..26d4d8fa3549 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -208,8 +208,12 @@ defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
-defm : X86WriteRes<WriteFMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
-defm : X86WriteRes<WriteFMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+
defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
index 8f3e4ae62d53..9a511ecc0071 100644
--- a/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+
defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
index 58caf1dacfcb..a8c65435ab9b 100755
--- a/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+
defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>;
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 55ca85ec1e3d..95f710061aeb 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -102,6 +102,12 @@ class X86SchedWriteMoveLS<SchedWrite MoveRR,
SchedWrite MR = StoreMR;
}
+// Multiclass that wraps masked load/store writes for a vector width.
+class X86SchedWriteMaskMove<SchedWrite LoadRM, SchedWrite StoreMR> {
+ SchedWrite RM = LoadRM;
+ SchedWrite MR = StoreMR;
+}
+
// Multiclass that wraps X86SchedWriteMoveLS for each vector width.
class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl,
X86SchedWriteMoveLS s128,
@@ -218,8 +224,12 @@ def WriteFStoreY : SchedWrite;
def WriteFStoreNT : SchedWrite;
def WriteFStoreNTX : SchedWrite;
def WriteFStoreNTY : SchedWrite;
-def WriteFMaskedStore : SchedWrite;
-def WriteFMaskedStoreY : SchedWrite;
+
+def WriteFMaskedStore32 : SchedWrite;
+def WriteFMaskedStore64 : SchedWrite;
+def WriteFMaskedStore32Y : SchedWrite;
+def WriteFMaskedStore64Y : SchedWrite;
+
def WriteFMove : SchedWrite;
def WriteFMoveX : SchedWrite;
def WriteFMoveY : SchedWrite;
@@ -530,6 +540,16 @@ def SchedWriteVecMoveLSNT
: X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX,
WriteVecMoveLSNTY, WriteVecMoveLSNTY>;
+// Conditional SIMD Packed Loads and Stores wrappers.
+def WriteFMaskMove32
+ : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore32>;
+def WriteFMaskMove64
+ : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore64>;
+def WriteFMaskMove32Y
+ : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore32Y>;
+def WriteFMaskMove64Y
+ : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore64Y>;
+
// Vector width wrappers.
def SchedWriteFAdd
: X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index b0334655de7e..78acb1065ec8 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -216,8 +216,10 @@ defm : X86WriteResUnsupported<WriteFStoreY>;
def : WriteRes<WriteFStoreNT, [AtomPort0]>;
def : WriteRes<WriteFStoreNTX, [AtomPort0]>;
defm : X86WriteResUnsupported<WriteFStoreNTY>;
-defm : X86WriteResUnsupported<WriteFMaskedStore>;
-defm : X86WriteResUnsupported<WriteFMaskedStoreY>;
+defm : X86WriteResUnsupported<WriteFMaskedStore32>;
+defm : X86WriteResUnsupported<WriteFMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteFMaskedStore64>;
+defm : X86WriteResUnsupported<WriteFMaskedStore64Y>;
def : WriteRes<WriteFMove, [AtomPort01]>;
def : WriteRes<WriteFMoveX, [AtomPort01]>;
diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td
index 8cc01c3acece..d7aea3cf4e9d 100644
--- a/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/lib/Target/X86/X86ScheduleBdVer2.td
@@ -726,8 +726,10 @@ defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>;
-defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
-defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
+defm : PdWriteRes<WriteFMaskedStore32, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
+defm : PdWriteRes<WriteFMaskedStore64, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
+defm : PdWriteRes<WriteFMaskedStore32Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
+defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>;
defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>;
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index 2d26232b4132..d0421d94ee05 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -180,9 +180,11 @@ multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
// Instructions that have local forwarding disabled have an extra +1cy latency.
-// A folded store needs a cycle on the SAGU for the store data,
-// most RMW instructions don't need an extra uop.
-defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
+// A folded store needs a cycle on the SAGU for the store data; most RMW
+// instructions don't need an extra uop. ALU RMW operations don't seem to
+// benefit from STLF, and their observed latency is 6cy. That is the reason why
+// this write adds two extra cycles (instead of just 1cy for the store).
+defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;
////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
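A rough cross-check on the new 2cy figure, using numbers stated elsewhere in this model (informal, not part of the patch): with the 3cy load-to-use latency assumed further down and the 1cy WriteALU in the next hunk, a folded ALU RMW works out to

    3cy (load) + 1cy (ALU) + 2cy (WriteRMW) = 6cy

which matches the observed 6cy latency the comment cites.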
@@ -191,22 +193,22 @@ defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;
-defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
-defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
-defm : X86WriteRes<WriteCMPXCHG,[JALU01], 1, [1], 1>;
-defm : X86WriteRes<WriteCMPXCHGRMW,[JALU01, JSAGU, JLAGU], 4, [1, 1, 1], 2>;
-defm : X86WriteRes<WriteXCHG, [JALU01], 1, [1], 1>;
-
-defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
-defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 2>;
-defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 2>;
+defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
+defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>;
+
+defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>;
+defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
+defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>;
+defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
+defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
+defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;
defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
@@ -305,6 +307,192 @@ def : WriteRes<WriteFence, [JSAGU]>;
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
+def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 16;
+ let ResourceCycles = [3,16,16];
+ let NumMicroOps = 5;
+}
+
+def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 17;
+ let ResourceCycles = [3,17,17];
+ let NumMicroOps = 6;
+}
+
+def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [3,1,1];
+ let NumMicroOps = 5;
+}
+
+def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [3,1,1];
+ let NumMicroOps = 18;
+}
+
+def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 32;
+ let ResourceCycles = [6,1,1];
+ let NumMicroOps = 28;
+}
+
+def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 19;
+ let ResourceCycles = [3,19,19];
+ let NumMicroOps = 18;
+}
+
+def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 38;
+ let ResourceCycles = [6,38,38];
+ let NumMicroOps = 28;
+}
+
+def JWriteCMPXCHGVariant : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>, [JWriteLOCK_CMPXCHG8B]>,
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>, [JWriteLOCK_CMPXCHG8rm]>,
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>, [JWriteLOCK_CMPXCHGrm]>,
+ SchedVar<MCSchedPredicate<IsCompareAndSwap8B>, [JWriteCMPXCHG8B]>,
+ SchedVar<MCSchedPredicate<IsCompareAndSwap16B>, [JWriteCMPXCHG16B]>,
+ SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>, [JWriteCMPXCHG8rm]>,
+ SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>, [WriteCMPXCHGRMW]>,
+ SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>, [JWriteCMPXCHG8rr]>,
+ SchedVar<NoSchedPred, [WriteCMPXCHG]>
+]>;
+
+// The first five reads are contributed by the memory load operand.
+// We ignore those reads and set a read-advance for the other input operands
+// including the implicit read of RAX.
+def : InstRW<[JWriteCMPXCHGVariant,
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
+ LCMPXCHG32, LCMPXCHG64,
+ CMPXCHG8rm, CMPXCHG16rm,
+ CMPXCHG32rm, CMPXCHG64rm)>;
+
+def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
+ CMPXCHG32rr, CMPXCHG64rr)>;
+
+def : InstRW<[JWriteCMPXCHGVariant,
+ // Ignore reads contributed by the memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Add a read-advance to every implicit register read.
+ ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
+ CMPXCHG8B, CMPXCHG16B)>;
+
+def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 19;
+ let ResourceCycles = [1,19,19];
+ let NumMicroOps = 1;
+}
+
+def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
+ SchedVar<NoSchedPred, [WriteALURMW]>
+]>;
+def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
+ DEC8m, DEC16m, DEC32m, DEC64m,
+ NOT8m, NOT16m, NOT32m, NOT64m,
+ NEG8m, NEG16m, NEG32m, NEG64m)>;
+
+def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
+ let Latency = 2;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
+ XADD32rr, XADD64rr)>;
+
+// This write defines the latency of the in/out register operand of a non-atomic
+// XADDrm. This is the first of a pair of writes that model non-atomic
+// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
+//
+// We need two writes because the instruction latency differs from the output
+// register operand latency. In particular, the first write describes the first
+// (and only) output register operand of the instruction. However, the
+// instruction latency is set to the MAX of all the write latencies. That's why
+// a second write is needed in this case (see example below).
+//
+// Example:
+// XADD %ecx, (%rsp) ## Instruction latency: 11cy
+// ## ECX write Latency: 3cy
+//
+// Register ECX becomes available in 3 cycles. That is because the value of ECX
+// is exchanged with the value read from the stack pointer, and the load-to-use
+// latency is assumed to be 3cy.
+def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 3; // load-to-use latency
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+// This write defines the latency of the in/out register operand of an atomic
+// XADDrm. This is the first of a sequence of two writes used to model atomic
+// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
+//
+//
+// Example:
+// LOCK XADD %ecx, (%rsp) ## Instruction Latency: 16cy
+// ## ECX write Latency: 11cy
+//
+// The value of ECX becomes available only after 11cy from the start of
+// execution. This write is used to specifically set that operand latency.
+def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+// This write defines the latency of the in/out register operand of an atomic
+// XCHGrm. This write is the first of a sequence of two writes that describe
+// atomic XCHG operations. We need two writes because the instruction latency
+// differs from the output register write latency. We want to make sure that
+// the output register operand becomes visible after 11cy. However, we want to
+// set the instruction latency to 16cy.
+def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 11;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 1;
+}
+
+def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
+ let Latency = 16;
+ let ResourceCycles = [16, 16];
+ let NumMicroOps = 1;
+}
+
+def JWriteXADDrm_Part1 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
+ SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]>
+]>;
+
+def JWriteXADDrm_Part2 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
+ SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]>
+]>;
+
+def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
+ (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
+ LXADD8, LXADD16, LXADD32, LXADD64)>;
+
+def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
+ (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
+
+
////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////
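On the "first five reads" mentioned in the CMPXCHG InstRW comments above: an X86 memory reference is modelled as five MachineOperands (base, scale, index, displacement, segment), which is why exactly five ReadDefault entries precede the read-advances. A small sketch of that layout, assuming the operand-index names from X86BaseInfo.h (the helper itself is hypothetical):

    #include "MCTargetDesc/X86BaseInfo.h"
    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineOperand.h"
    using namespace llvm;

    // Walks the five sub-operands of a memory reference starting at MemOpStart.
    // X86::AddrNumOperands == 5, matching the five ReadDefault entries.
    void visitMemRef(const MachineInstr &MI, unsigned MemOpStart) {
      const MachineOperand &Base  = MI.getOperand(MemOpStart + X86::AddrBaseReg);
      const MachineOperand &Scale = MI.getOperand(MemOpStart + X86::AddrScaleAmt);
      const MachineOperand &Index = MI.getOperand(MemOpStart + X86::AddrIndexReg);
      const MachineOperand &Disp  = MI.getOperand(MemOpStart + X86::AddrDisp);
      const MachineOperand &Seg   = MI.getOperand(MemOpStart + X86::AddrSegmentReg);
      (void)Base; (void)Scale; (void)Index; (void)Disp; (void)Seg;
    }

The register and implicit EAX/RAX reads come after those five, which is where the ReadAfterLd read-advances apply.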
@@ -313,19 +501,22 @@ defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteFLoadX, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteFLoadY, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;
-defm : X86WriteRes<WriteFMaskedStore, [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>;
-defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
+defm : X86WriteRes<WriteFMaskedStore64, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
@@ -466,8 +657,8 @@ defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
////////////////////////////////////////////////////////////////////////////////
defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteVecLoadX, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteVecLoadY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [JLAGU], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
@@ -475,7 +666,7 @@ defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4],
defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
@@ -631,6 +822,18 @@ def JWriteJVZEROUPPER: SchedWriteRes<[]> {
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
///////////////////////////////////////////////////////////////////////////////
+// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU
+///////////////////////////////////////////////////////////////////////////////
+
+def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
+ let Latency = 34;
+ let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
+ let NumMicroOps = 63;
+}
+def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
+ VMASKMOVDQU, VMASKMOVDQU64)>;
+
+///////////////////////////////////////////////////////////////////////////////
// SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index 34c251a5c5bb..8e3ce721f1a1 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -186,8 +186,12 @@ def : WriteRes<WriteFStoreY, [SLM_MEC_RSV]>;
def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>;
def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>;
def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>;
-def : WriteRes<WriteFMaskedStore, [SLM_MEC_RSV]>;
-def : WriteRes<WriteFMaskedStoreY, [SLM_MEC_RSV]>;
+
+def : WriteRes<WriteFMaskedStore32, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore32Y, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore64, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore64Y, [SLM_MEC_RSV]>;
+
def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>;
def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>;
def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>;
diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td
index 65f6d89df610..06201f4a3a84 100644
--- a/lib/Target/X86/X86ScheduleZnver1.td
+++ b/lib/Target/X86/X86ScheduleZnver1.td
@@ -268,8 +268,12 @@ defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>;
-defm : X86WriteRes<WriteFMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
-defm : X86WriteRes<WriteFMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+
defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 50690953eef5..1ae8df977f83 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -36,7 +36,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
DAG.getSubtarget().getRegisterInfo());
- unsigned BaseReg = TRI->getBaseRegister();
+ Register BaseReg = TRI->getBaseRegister();
for (unsigned R : ClobberSet)
if (BaseReg == R)
return true;
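The unsigned-to-Register changes here and in X86SpeculativeLoadHardening.cpp below are mechanical, but the idiom is worth a sketch: llvm::Register wraps the same raw value while answering the virtual/physical question itself, so TargetRegisterInfo is no longer needed for that check. Illustrative only (the helper name is made up; the isValid/isVirtual accessors are assumed from llvm/CodeGen/Register.h):

    #include "llvm/CodeGen/Register.h"
    using namespace llvm;

    // Equivalent in spirit to the Register::isVirtualRegister(Reg) calls in
    // the hardening pass below.
    bool isUsableVirtReg(Register R) {
      return R.isValid() && R.isVirtual();
    }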
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 40f5dbe57e4b..b8980789258e 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -477,7 +477,7 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
// Otherwise, just build the predicate state itself by zeroing a register
// as we don't need any initial state.
PS->InitialReg = MRI->createVirtualRegister(PS->RC);
- unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
PredStateSubReg);
++NumInstsInserted;
@@ -750,7 +750,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
- unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
// Note that we intentionally use an empty debug location so that
// this picks up the preceding location.
auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
@@ -907,7 +907,7 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
MI.dump(); dbgs() << "\n");
report_fatal_error("Unable to unfold load!");
}
- unsigned Reg = MRI->createVirtualRegister(UnfoldedRC);
+ Register Reg = MRI->createVirtualRegister(UnfoldedRC);
SmallVector<MachineInstr *, 2> NewMIs;
// If we were able to compute an unfolded reg class, any failure here
// is just a programming error so just assert.
@@ -1102,7 +1102,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
// synthetic target in the predecessor. We do this at the bottom of the
// predecessor.
auto InsertPt = Pred->getFirstTerminator();
- unsigned TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
if (MF.getTarget().getCodeModel() == CodeModel::Small &&
!Subtarget->isPositionIndependent()) {
// Directly materialize it into an immediate.
@@ -1153,7 +1153,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
} else {
// Otherwise compute the address into a register first.
- unsigned AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
auto AddrI =
BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
.addReg(/*Base*/ X86::RIP)
@@ -1175,7 +1175,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
// Now cmov over the predicate if the comparison wasn't equal.
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
- unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
auto CMovI =
BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
.addReg(PS->InitialReg)
@@ -1878,7 +1878,7 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
DebugLoc Loc) {
// FIXME: Hard coding this to a 32-bit register class seems weird, but matches
// what instruction selection does.
- unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
// We directly copy the FLAGS register and rely on later lowering to clean
// this up into the appropriate setCC instructions.
BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
@@ -1905,7 +1905,7 @@ void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
unsigned PredStateReg) {
- unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
+ Register TmpReg = MRI->createVirtualRegister(PS->RC);
// FIXME: This hard codes a shift distance based on the number of bits needed
// to stay canonical on 64-bit. We should compute this somehow and support
// 32-bit as part of that.
@@ -1925,8 +1925,8 @@ void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
DebugLoc Loc) {
- unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
- unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
+ Register PredStateReg = MRI->createVirtualRegister(PS->RC);
+ Register TmpReg = MRI->createVirtualRegister(PS->RC);
// We know that the stack pointer will have any preserved predicate state in
// its high bit. We just want to smear this across the other bits. Turns out,
@@ -2031,9 +2031,9 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
}
for (MachineOperand *Op : HardenOpRegs) {
- unsigned OpReg = Op->getReg();
+ Register OpReg = Op->getReg();
auto *OpRC = MRI->getRegClass(OpReg);
- unsigned TmpReg = MRI->createVirtualRegister(OpRC);
+ Register TmpReg = MRI->createVirtualRegister(OpRC);
// If this is a vector register, we'll need somewhat custom logic to handle
// hardening it.
@@ -2045,7 +2045,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
// Move our state into a vector register.
// FIXME: We could skip this at the cost of longer encodings with AVX-512
// but that doesn't seem likely worth it.
- unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
+ Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
auto MovI =
BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
.addReg(StateReg);
@@ -2054,7 +2054,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
// Broadcast it across the vector register.
- unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
+ Register VBStateReg = MRI->createVirtualRegister(OpRC);
auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
TII->get(Is128Bit ? X86::VPBROADCASTQrr
: X86::VPBROADCASTQYrr),
@@ -2084,7 +2084,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
// Broadcast our state into a vector register.
- unsigned VStateReg = MRI->createVirtualRegister(OpRC);
+ Register VStateReg = MRI->createVirtualRegister(OpRC);
unsigned BroadcastOp =
Is128Bit ? X86::VPBROADCASTQrZ128r
: Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
@@ -2153,7 +2153,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// See if we can sink hardening the loaded value.
auto SinkCheckToSingleUse =
[&](MachineInstr &MI) -> Optional<MachineInstr *> {
- unsigned DefReg = MI.getOperand(0).getReg();
+ Register DefReg = MI.getOperand(0).getReg();
// We need to find a single use which we can sink the check. We can
// primarily do this because many uses may already end up checked on their
@@ -2210,8 +2210,8 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// If this register isn't a virtual register we can't walk uses of sanely,
// just bail. Also check that its register class is one of the ones we
// can harden.
- unsigned UseDefReg = UseMI.getOperand(0).getReg();
- if (!TRI->isVirtualRegister(UseDefReg) ||
+ Register UseDefReg = UseMI.getOperand(0).getReg();
+ if (!Register::isVirtualRegister(UseDefReg) ||
!canHardenRegister(UseDefReg))
return {};
@@ -2241,6 +2241,9 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
// We don't support post-load hardening of vectors.
return false;
+ unsigned RegIdx = Log2_32(RegBytes);
+ assert(RegIdx < 4 && "Unsupported register size");
+
// If this register class is explicitly constrained to a class that doesn't
// require REX prefix, we may not be able to satisfy that constraint when
// emitting the hardening instructions, so bail out here.
@@ -2251,13 +2254,13 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
const TargetRegisterClass *NOREXRegClasses[] = {
&X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
&X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
- if (RC == NOREXRegClasses[Log2_32(RegBytes)])
+ if (RC == NOREXRegClasses[RegIdx])
return false;
const TargetRegisterClass *GPRRegClasses[] = {
&X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
&X86::GR64RegClass};
- return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
+ return RC->hasSuperClassEq(GPRRegClasses[RegIdx]);
}
/// Harden a value in a register.
@@ -2278,7 +2281,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
DebugLoc Loc) {
assert(canHardenRegister(Reg) && "Cannot harden this register!");
- assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!");
+ assert(Register::isVirtualRegister(Reg) && "Cannot harden a physical register!");
auto *RC = MRI->getRegClass(Reg);
int Bytes = TRI->getRegSizeInBits(*RC) / 8;
@@ -2289,7 +2292,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
if (Bytes != 8) {
unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
- unsigned NarrowStateReg = MRI->createVirtualRegister(RC);
+ Register NarrowStateReg = MRI->createVirtualRegister(RC);
BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
.addReg(StateReg, 0, SubRegImm);
StateReg = NarrowStateReg;
@@ -2299,7 +2302,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
if (isEFLAGSLive(MBB, InsertPt, *TRI))
FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
- unsigned NewReg = MRI->createVirtualRegister(RC);
+ Register NewReg = MRI->createVirtualRegister(RC);
unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
@@ -2329,13 +2332,13 @@ unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
DebugLoc Loc = MI.getDebugLoc();
auto &DefOp = MI.getOperand(0);
- unsigned OldDefReg = DefOp.getReg();
+ Register OldDefReg = DefOp.getReg();
auto *DefRC = MRI->getRegClass(OldDefReg);
// Because we want to completely replace the uses of this def'ed value with
// the hardened value, create a dedicated new register that will only be used
// to communicate the unhardened value to the hardening.
- unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC);
+ Register UnhardenedReg = MRI->createVirtualRegister(DefRC);
DefOp.setReg(UnhardenedReg);
// Now harden this register's value, getting a hardened reg that is safe to
@@ -2537,7 +2540,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
.addReg(ExpectedRetAddrReg, RegState::Kill)
.addSym(RetSymbol);
} else {
- unsigned ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
+ Register ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg)
.addReg(/*Base*/ X86::RIP)
.addImm(/*Scale*/ 1)
@@ -2554,7 +2557,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
- unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
.addReg(NewStateReg, RegState::Kill)
.addReg(PS->PoisonReg)
@@ -2611,7 +2614,7 @@ void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
// For all of these, the target register is the first operand of the
// instruction.
auto &TargetOp = MI.getOperand(0);
- unsigned OldTargetReg = TargetOp.getReg();
+ Register OldTargetReg = TargetOp.getReg();
// Try to lookup a hardened version of this register. We retain a reference
// here as we want to update the map to track any newly computed hardened
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index d5bb56603df9..f8f78da52cc2 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -146,6 +146,9 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
return X86II::MO_DLLIMPORT;
return X86II::MO_COFFSTUB;
}
+ // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables.
+ if (isOSWindows())
+ return X86II::MO_NO_FLAG;
if (is64Bit()) {
// ELF supports a large, truly PIC code model with non-PC relative GOT
@@ -285,10 +288,10 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both
// 32 and 64 bit) and for all 64-bit targets.
if (StackAlignOverride)
- stackAlignment = StackAlignOverride;
+ stackAlignment = *StackAlignOverride;
else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
isTargetKFreeBSD() || In64BitMode)
- stackAlignment = 16;
+ stackAlignment = Align(16);
// Some CPUs have more overhead for gather. The specified overhead is relative
// to the Load operation. "2" is the number provided by Intel architects. This
@@ -304,6 +307,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// Consume the vector width attribute or apply any target specific limit.
if (PreferVectorWidthOverride)
PreferVectorWidth = PreferVectorWidthOverride;
+ else if (Prefer128Bit)
+ PreferVectorWidth = 128;
else if (Prefer256Bit)
PreferVectorWidth = 256;
}
@@ -316,12 +321,11 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
- unsigned StackAlignOverride,
+ MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth)
- : X86GenSubtargetInfo(TT, CPU, FS),
- PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
- StackAlignOverride(StackAlignOverride),
+ : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::None), TM(TM),
+ TargetTriple(TT), StackAlignOverride(StackAlignOverride),
PreferVectorWidthOverride(PreferVectorWidthOverride),
RequiredVectorWidth(RequiredVectorWidth),
In64BitMode(TargetTriple.getArch() == Triple::x86_64),
@@ -355,7 +359,7 @@ const CallLowering *X86Subtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
-const InstructionSelector *X86Subtarget::getInstructionSelector() const {
+InstructionSelector *X86Subtarget::getInstructionSelector() const {
return InstSelector.get();
}
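The StackAlignOverride plumbing above moves from a raw unsigned (with 0 meaning "no override") to MaybeAlign/Align, making the unset state explicit; the *StackAlignOverride dereference in initSubtargetFeatures is only reached when an override is actually present. A minimal sketch of the pattern (hypothetical helper, APIs assumed from llvm/Support/Alignment.h):

    #include "llvm/Support/Alignment.h"
    using namespace llvm;

    // Honour an explicit override if one was given, otherwise fall back to
    // the subtarget default, mirroring the logic in initSubtargetFeatures.
    Align resolveStackAlignment(MaybeAlign Override, bool Prefers16Bytes) {
      if (Override)
        return *Override;
      return Prefers16Bytes ? Align(16) : Align(4);
    }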
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 24ccc9cb7843..e8efe8f2afe5 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -365,8 +365,8 @@ protected:
/// Processor has AVX-512 vp2intersect instructions
bool HasVP2INTERSECT = false;
- /// Processor supports MPX - Memory Protection Extensions
- bool HasMPX = false;
+ /// Deprecated flag for MPX instructions.
+ bool DeprecatedHasMPX = false;
/// Processor supports CET SHSTK - Control-Flow Enforcement Technology
/// using Shadow Stack
@@ -427,15 +427,21 @@ protected:
/// Use software floating point for code generation.
bool UseSoftFloat = false;
+ /// Use alias analysis during code generation.
+ bool UseAA = false;
+
/// The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
- unsigned stackAlignment = 4;
+ Align stackAlignment = Align(4);
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///
// FIXME: this is a known good value for Yonah. How about others?
unsigned MaxInlineSizeThreshold = 128;
+ /// Indicates target prefers 128 bit instructions.
+ bool Prefer128Bit = false;
+
/// Indicates target prefers 256 bit instructions.
bool Prefer256Bit = false;
@@ -453,7 +459,7 @@ protected:
private:
/// Override the stack alignment.
- unsigned StackAlignOverride;
+ MaybeAlign StackAlignOverride;
/// Preferred vector width from function attribute.
unsigned PreferVectorWidthOverride;
@@ -490,7 +496,7 @@ public:
/// of the specified triple.
///
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const X86TargetMachine &TM, unsigned StackAlignOverride,
+ const X86TargetMachine &TM, MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth);
@@ -515,7 +521,7 @@ public:
/// Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
- unsigned getStackAlignment() const { return stackAlignment; }
+ Align getStackAlignment() const { return stackAlignment; }
/// Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -527,7 +533,7 @@ public:
/// Methods used by Global ISel
const CallLowering *getCallLowering() const override;
- const InstructionSelector *getInstructionSelector() const override;
+ InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
@@ -684,7 +690,6 @@ public:
bool hasBF16() const { return HasBF16; }
bool hasVP2INTERSECT() const { return HasVP2INTERSECT; }
bool hasBITALG() const { return HasBITALG; }
- bool hasMPX() const { return HasMPX; }
bool hasSHSTK() const { return HasSHSTK; }
bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
bool hasCLWB() const { return HasCLWB; }
@@ -739,6 +744,7 @@ public:
X86ProcFamily == IntelTRM;
}
bool useSoftFloat() const { return UseSoftFloat; }
+ bool useAA() const override { return UseAA; }
/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
/// no-sse2). There isn't any reason to disable it if the target processor
@@ -809,6 +815,7 @@ public:
// On Win64, all these conventions just use the default convention.
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Tail:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 0cbf13899a29..c15297134e4d 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -81,27 +81,28 @@ extern "C" void LLVMInitializeX86Target() {
initializeX86SpeculativeLoadHardeningPassPass(PR);
initializeX86FlagsCopyLoweringPassPass(PR);
initializeX86CondBrFoldingPassPass(PR);
+ initializeX86OptimizeLEAPassPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO()) {
if (TT.getArch() == Triple::x86_64)
- return llvm::make_unique<X86_64MachoTargetObjectFile>();
- return llvm::make_unique<TargetLoweringObjectFileMachO>();
+ return std::make_unique<X86_64MachoTargetObjectFile>();
+ return std::make_unique<TargetLoweringObjectFileMachO>();
}
if (TT.isOSFreeBSD())
- return llvm::make_unique<X86FreeBSDTargetObjectFile>();
+ return std::make_unique<X86FreeBSDTargetObjectFile>();
if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
- return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
+ return std::make_unique<X86LinuxNaClTargetObjectFile>();
if (TT.isOSSolaris())
- return llvm::make_unique<X86SolarisTargetObjectFile>();
+ return std::make_unique<X86SolarisTargetObjectFile>();
if (TT.isOSFuchsia())
- return llvm::make_unique<X86FuchsiaTargetObjectFile>();
+ return std::make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
- return llvm::make_unique<X86ELFTargetObjectFile>();
+ return std::make_unique<X86ELFTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
- return llvm::make_unique<TargetLoweringObjectFileCOFF>();
+ return std::make_unique<TargetLoweringObjectFileCOFF>();
llvm_unreachable("unknown subtarget type");
}
@@ -116,6 +117,9 @@ static std::string computeDataLayout(const Triple &TT) {
!TT.isArch64Bit())
Ret += "-p:32:32";
+ // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers.
+ Ret += "-p270:32:32-p271:32:32-p272:64:64";
+
// Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
Ret += "-i64:64";
@@ -218,17 +222,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
OL),
TLOF(createTLOF(getTargetTriple())) {
- // Windows stack unwinder gets confused when execution flow "falls through"
- // after a call to 'noreturn' function.
- // To prevent that, we emit a trap for 'unreachable' IR instructions.
- // (which on X86, happens to be the 'ud2' instruction)
// On PS4, the "return address" of a 'noreturn' call must still be within
// the calling function, and TrapUnreachable is an easy way to get that.
- // The check here for 64-bit windows is a bit icky, but as we're unlikely
- // to ever want to mix 32 and 64-bit windows code in a single module
- // this should be fine.
- if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4() ||
- TT.isOSBinFormatMachO()) {
+ if (TT.isPS4() || TT.isOSBinFormatMachO()) {
this->Options.TrapUnreachable = true;
this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO();
}
@@ -311,10 +307,10 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
- Options.StackAlignmentOverride,
- PreferVectorWidthOverride,
- RequiredVectorWidth);
+ I = std::make_unique<X86Subtarget>(
+ TargetTriple, CPU, FS, *this,
+ MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride,
+ RequiredVectorWidth);
}
return I.get();
}
@@ -517,12 +513,19 @@ void X86PassConfig::addPreEmitPass() {
}
void X86PassConfig::addPreEmitPass2() {
+ const Triple &TT = TM->getTargetTriple();
+ const MCAsmInfo *MAI = TM->getMCAsmInfo();
+
addPass(createX86RetpolineThunksPass());
+
+ // Insert extra int3 instructions after trailing call instructions to avoid
+ // issues in the unwinder.
+ if (TT.isOSWindows() && TT.getArch() == Triple::x86_64)
+ addPass(createX86AvoidTrailingCallPass());
+
// Verify basic block incoming and outgoing cfa offset and register values and
// correct CFA calculation rule where needed by inserting appropriate CFI
// instructions.
- const Triple &TT = TM->getTargetTriple();
- const MCAsmInfo *MAI = TM->getMCAsmInfo();
if (!TT.isOSDarwin() &&
(!TT.isOSWindows() ||
MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI))
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index b999e2e86af6..ec3db7b1e9e8 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -16,7 +16,6 @@
#include "X86Subtarget.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
#include <memory>
@@ -26,6 +25,7 @@ namespace llvm {
class StringRef;
class X86Subtarget;
class X86RegisterBankInfo;
+class TargetTransformInfo;
class X86TargetMachine final : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 92e0779c2e74..44185957686b 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -47,8 +47,8 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
}
const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
- const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
- MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV,
+ int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
// On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry
// from a data section. In case there's an additional offset, then use
// foo@GOTPCREL+4+<offset>.
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 13d7b4ad70d6..1fd0bbf56b19 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -30,7 +30,8 @@ namespace llvm {
const TargetMachine &TM,
MachineModuleInfo *MMI) const override;
- const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+ const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV,
+ const MCSymbol *Sym,
const MCValue &MV, int64_t Offset,
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 3dc59aeb263e..70fd857fcf01 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -116,7 +116,8 @@ llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
-unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector && !ST->hasSSE1())
return 0;
@@ -887,7 +888,7 @@ int X86TTIImpl::getArithmeticInstrCost(
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
- // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
@@ -911,6 +912,39 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
int NumSubElts = SubLT.second.getVectorNumElements();
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
return SubLT.first;
+ // Handle some cases for widening legalization. For now we only handle
+ // cases where the original subvector was naturally aligned and evenly
+ // fit in its legalized subvector type.
+ // FIXME: Remove some of the alignment restrictions.
+ // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
+ // vectors.
+ int OrigSubElts = SubTp->getVectorNumElements();
+ if (NumSubElts > OrigSubElts &&
+ (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
+ LT.second.getVectorElementType() ==
+ SubLT.second.getVectorElementType() &&
+ LT.second.getVectorElementType().getSizeInBits() ==
+ Tp->getVectorElementType()->getPrimitiveSizeInBits()) {
+ assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
+ "Unexpected number of elements!");
+ Type *VecTy = VectorType::get(Tp->getVectorElementType(),
+ LT.second.getVectorNumElements());
+ Type *SubTy = VectorType::get(Tp->getVectorElementType(),
+ SubLT.second.getVectorNumElements());
+ int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
+ int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
+ ExtractIndex, SubTy);
+
+ // If the original size is 32-bits or more, we can use pshufd. Otherwise
+ // if we have SSSE3 we can use pshufb.
+ if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
+ return ExtractCost + 1; // pshufd or pshufb
+
+ assert(SubTp->getPrimitiveSizeInBits() == 16 &&
+ "Unexpected vector size");
+
+ return ExtractCost + 2; // worst case pshufhw + pshufd
+ }
}
}
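A concrete instance of the widening case handled above (an informal reading of the code, not measured data): extracting a v2i32 subvector at index 2 from a v4i32. Widening legalization turns v2i32 into v4i32, so OrigSubElts = 2 while NumSubElts = 4; the guards all pass, ExtractIndex rounds down to 0, and the returned cost is

    ExtractCost(aligned v4i32 extract) + 1   // the single pshufd

with the pshufhw + pshufd pair (the "+ 2" path) reserved for 16-bit subvectors on targets without SSSE3.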
@@ -1314,8 +1348,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
@@ -1354,6 +1390,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
@@ -1371,14 +1409,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
@@ -1402,13 +1440,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
@@ -1421,7 +1459,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
@@ -1507,6 +1548,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
};
@@ -1520,7 +1562,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
@@ -1536,6 +1579,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
@@ -1562,15 +1607,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
+ { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
};
std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
@@ -1691,6 +1742,11 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
}
}
+ static const CostTblEntry SLMCostTbl[] = {
+ // SLM pcmpeq/pcmpgt throughput is 2
+ { ISD::SETCC, MVT::v2i64, 2 },
+ };
+
static const CostTblEntry AVX512BWCostTbl[] = {
{ ISD::SETCC, MVT::v32i16, 1 },
{ ISD::SETCC, MVT::v64i8, 1 },
@@ -1777,6 +1833,10 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
};
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
return LT.first * (ExtraCost + Entry->Cost);
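
The new SLM special case above follows the standard TTI cost-table pattern: a static table keyed by (ISD opcode, simple value type), consulted only when the subtarget feature applies, and scaled by the legalization split count LT.first plus any predicate-dependent ExtraCost. Below is a minimal standalone sketch of that pattern; it uses plain C++ stand-ins (CostEntry, costTableLookup, the fake enums) rather than LLVM's CostTable.h, so every name here is illustrative.

#include <cstdio>

// Stand-ins for LLVM's CostTblEntry / CostTableLookup (illustrative only).
struct CostEntry {
  int ISD;   // operation, e.g. SETCC
  int Type;  // simple value type, e.g. v2i64
  int Cost;  // reciprocal-throughput-style cost
};

enum { ISD_SETCC = 1 };
enum { MVT_v2i64 = 1 };

// Silvermont: vector 64-bit compares are comparatively slow.
static const CostEntry SLMTable[] = {
    {ISD_SETCC, MVT_v2i64, 2},
};

static const CostEntry *costTableLookup(const CostEntry *Tbl, int N, int ISD,
                                        int Ty) {
  for (int I = 0; I != N; ++I)
    if (Tbl[I].ISD == ISD && Tbl[I].Type == Ty)
      return &Tbl[I];
  return nullptr; // no special case: fall through to the generic tables
}

int main() {
  int LTFirst = 1;   // how many legal registers the type was split into
  int ExtraCost = 0; // e.g. extra instructions for unordered FP predicates
  if (const CostEntry *E = costTableLookup(SLMTable, 1, ISD_SETCC, MVT_v2i64))
    std::printf("v2i64 setcc cost on SLM: %d\n",
                LTFirst * (ExtraCost + E->Cost)); // prints 2
  return 0;
}
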
@@ -2043,8 +2103,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
+ static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
+ { ISD::CTLZ, MVT::i64, 1 },
+ };
+ static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTLZ, MVT::i32, 1 },
+ { ISD::CTLZ, MVT::i16, 1 },
+ { ISD::CTLZ, MVT::i8, 1 },
+ };
+ static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
+ { ISD::CTPOP, MVT::i64, 1 },
+ };
+ static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTPOP, MVT::i32, 1 },
+ { ISD::CTPOP, MVT::i16, 1 },
+ { ISD::CTPOP, MVT::i8, 1 },
+ };
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::BITREVERSE, MVT::i64, 14 },
+ { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTPOP, MVT::i64, 10 },
{ ISD::SADDO, MVT::i64, 1 },
{ ISD::UADDO, MVT::i64, 1 },
};
@@ -2052,6 +2130,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::BITREVERSE, MVT::i32, 14 },
{ ISD::BITREVERSE, MVT::i16, 14 },
{ ISD::BITREVERSE, MVT::i8, 11 },
+ { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTPOP, MVT::i32, 8 },
+ { ISD::CTPOP, MVT::i16, 9 },
+ { ISD::CTPOP, MVT::i8, 7 },
{ ISD::SADDO, MVT::i32, 1 },
{ ISD::SADDO, MVT::i16, 1 },
{ ISD::SADDO, MVT::i8, 1 },
@@ -2163,6 +2247,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
+ if (ST->hasLZCNT()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
+ if (ST->hasPOPCNT()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
+ // TODO - add BMI (TZCNT) scalar handling
+
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
@@ -2357,8 +2461,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
unsigned NumElem = SrcVTy->getVectorNumElements();
VectorType *MaskTy =
VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
- if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
- (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
+ if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) ||
+ (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
+ !isPowerOf2_32(NumElem)) {
// Scalarization
int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
@@ -2425,70 +2530,107 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
bool IsPairwise) {
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
-
- MVT MTy = LT.second;
-
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
- assert(ISD && "Invalid opcode");
-
// We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
// and use it as the cost.
- static const CostTblEntry SSE42CostTblPairWise[] = {
+ static const CostTblEntry SSE2CostTblPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+ { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
+ { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
{ ISD::ADD, MVT::v8i16, 5 },
+ { ISD::ADD, MVT::v2i8, 2 },
+ { ISD::ADD, MVT::v4i8, 2 },
+ { ISD::ADD, MVT::v8i8, 2 },
+ { ISD::ADD, MVT::v16i8, 3 },
};
static const CostTblEntry AVX1CostTblPairWise[] = {
- { ISD::FADD, MVT::v4f32, 4 },
{ ISD::FADD, MVT::v4f64, 5 },
{ ISD::FADD, MVT::v8f32, 7 },
{ ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
- { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
{ ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
- { ISD::ADD, MVT::v8i16, 5 },
{ ISD::ADD, MVT::v8i32, 5 },
+ { ISD::ADD, MVT::v16i16, 6 },
+ { ISD::ADD, MVT::v32i8, 4 },
};
- static const CostTblEntry SSE42CostTblNoPairWise[] = {
+ static const CostTblEntry SSE2CostTblNoPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
+ { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
+ { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
{ ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
+ { ISD::ADD, MVT::v2i8, 2 },
+ { ISD::ADD, MVT::v4i8, 2 },
+ { ISD::ADD, MVT::v8i8, 2 },
+ { ISD::ADD, MVT::v16i8, 3 },
};
static const CostTblEntry AVX1CostTblNoPairWise[] = {
- { ISD::FADD, MVT::v4f32, 3 },
{ ISD::FADD, MVT::v4f64, 3 },
+ { ISD::FADD, MVT::v4f32, 3 },
{ ISD::FADD, MVT::v8f32, 4 },
{ ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
- { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
{ ISD::ADD, MVT::v4i64, 3 },
- { ISD::ADD, MVT::v8i16, 4 },
{ ISD::ADD, MVT::v8i32, 5 },
+ { ISD::ADD, MVT::v16i16, 5 },
+ { ISD::ADD, MVT::v32i8, 4 },
};
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // Before legalizing the type, give a chance to look up illegal narrow types
+ // in the table.
+ // FIXME: Is there a better way to do this?
+ EVT VT = TLI->getValueType(DL, ValTy);
+ if (VT.isSimple()) {
+ MVT MTy = VT.getSimpleVT();
+ if (IsPairwise) {
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
+ return Entry->Cost;
+ } else {
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+ }
+ }
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
if (IsPairwise) {
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
return LT.first * Entry->Cost;
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
return LT.first * Entry->Cost;
} else {
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
return LT.first * Entry->Cost;
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
return LT.first * Entry->Cost;
}
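
The reordering above is the substantive change in getArithmeticReductionCost: the tables are now probed twice, first with the IR type's own simple MVT so that illegal narrow vectors (v2i8, v4i16, ...) can hit their dedicated, cheaper entries, and only afterwards with the legalized type scaled by the split count. The following is a standalone sketch of that two-stage lookup with made-up table contents and a toy legalize() standing in for TLI->getTypeLegalizationCost.

#include <cstdio>
#include <utility>

// Illustrative stand-ins; the real code goes through TLI and CostTableLookup.
struct Entry { int ISD; int Ty; int Cost; };
enum { ISD_ADD = 1 };
enum { MVT_v2i8 = 1, MVT_v16i8 = 2 };

static const Entry SSE2NoPairWise[] = {
    {ISD_ADD, MVT_v2i8, 2},  // dedicated entry for the illegal narrow type
    {ISD_ADD, MVT_v16i8, 3}, // entry for the legal 128-bit type
};

static const Entry *lookup(int ISD, int Ty) {
  for (const Entry &E : SSE2NoPairWise)
    if (E.ISD == ISD && E.Ty == Ty)
      return &E;
  return nullptr;
}

// Toy legalization: v2i8 is widened into a single v16i8 register.
static std::pair<int, int> legalize(int Ty) {
  return {1, Ty == MVT_v2i8 ? MVT_v16i8 : Ty};
}

static int reductionCost(int ISD, int Ty) {
  // Stage 1: exact (possibly illegal) type, no scaling.
  if (const Entry *E = lookup(ISD, Ty))
    return E->Cost;
  // Stage 2: legalized type, scaled by the number of legal pieces.
  std::pair<int, int> LT = legalize(Ty);
  if (const Entry *E = lookup(ISD, LT.second))
    return LT.first * E->Cost;
  return -1; // defer to the generic implementation
}

int main() {
  // The narrow type hits its own entry (cost 2) instead of the widened one (3).
  std::printf("v2i8 add reduction cost: %d\n", reductionCost(ISD_ADD, MVT_v2i8));
  return 0;
}
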
@@ -3116,7 +3258,7 @@ bool X86TTIImpl::canMacroFuseCmp() {
return ST->hasMacroFusion() || ST->hasBranchFusion();
}
-bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
if (!ST->hasAVX())
return false;
@@ -3139,11 +3281,11 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
}
-bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
- return isLegalMaskedLoad(DataType);
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+ return isLegalMaskedLoad(DataType, Alignment);
}
-bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
unsigned DataSize = DL.getTypeStoreSize(DataType);
// The only supported nontemporal loads are for aligned vectors of 16 or 32
// bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
@@ -3154,7 +3296,7 @@ bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
return false;
}
-bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
unsigned DataSize = DL.getTypeStoreSize(DataType);
// SSE4A supports nontemporal stores of float and double at arbitrary
@@ -3299,9 +3441,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
if (IsZeroCmp) {
// Only enable vector loads for equality comparison. Right now the vector
// version is not as fast for three-way compare (see #33329).
- // TODO: enable AVX512 when the DAG is ready.
- // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
const unsigned PreferredWidth = ST->getPreferVectorWidth();
+ if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
// All GPR and vector loads can be unaligned. SIMD compare requires integer
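
With the re-enabled AVX-512 path above, memcmp expansion may now offer 64-byte vector loads, but only when the subtarget both has AVX-512 and prefers a vector width of at least 512 bits; narrower preferences fall back to the 32- and 16-byte cases. A standalone sketch of that gating follows (SubtargetFlags and memcmpLoadSizes are illustrative stand-ins, not the X86Subtarget API).

#include <cstdio>
#include <vector>

struct SubtargetFlags {
  unsigned PreferVectorWidth; // from -mprefer-vector-width, e.g. 128/256/512
  bool HasAVX512, HasAVX2, HasSSE2;
};

// Largest loads first, each gated on both the ISA and the preferred width.
static std::vector<unsigned> memcmpLoadSizes(const SubtargetFlags &ST) {
  std::vector<unsigned> Sizes;
  if (ST.PreferVectorWidth >= 512 && ST.HasAVX512) Sizes.push_back(64);
  if (ST.PreferVectorWidth >= 256 && ST.HasAVX2)   Sizes.push_back(32);
  if (ST.PreferVectorWidth >= 128 && ST.HasSSE2)   Sizes.push_back(16);
  // Scalar GPR loads are always available (64-bit target assumed here).
  for (unsigned S : {8u, 4u, 2u, 1u})
    Sizes.push_back(S);
  return Sizes;
}

int main() {
  SubtargetFlags AVX512CPU{512, true, true, true};
  SubtargetFlags Width256{256, true, true, true}; // AVX-512 present, but capped
  for (unsigned S : memcmpLoadSizes(AVX512CPU)) std::printf("%u ", S);
  std::printf("\n"); // 64 32 16 8 4 2 1
  for (unsigned S : memcmpLoadSizes(Width256)) std::printf("%u ", S);
  std::printf("\n"); // 32 16 8 4 2 1
  return 0;
}
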
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 25d9c33eb16d..7581257f41f8 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -83,6 +83,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureSlowUAMem32,
// Based on whether user set the -mprefer-vector-width command line.
+ X86::FeaturePrefer128Bit,
X86::FeaturePrefer256Bit,
// CPU name enums. These just follow CPU string.
@@ -115,7 +116,7 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector);
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(unsigned VF);
@@ -184,10 +185,10 @@ public:
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
bool canMacroFuseCmp();
- bool isLegalMaskedLoad(Type *DataType);
- bool isLegalMaskedStore(Type *DataType);
- bool isLegalNTLoad(Type *DataType, unsigned Alignment);
- bool isLegalNTStore(Type *DataType, unsigned Alignment);
+ bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment);
+ bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment);
+ bool isLegalNTLoad(Type *DataType, Align Alignment);
+ bool isLegalNTStore(Type *DataType, Align Alignment);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
bool isLegalMaskedExpandLoad(Type *DataType);
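
These header changes are the interface side of the alignment work: the nontemporal hooks now take Align, which by construction is a known power of two, while the masked load/store hooks take MaybeAlign, which may be empty when no alignment has been assigned yet. The sketch below imitates that distinction with stand-in types (Alignment / OptionalAlignment); it is not llvm/Support/Alignment.h, only the idea behind it, and the legality rules are simplified for illustration.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <optional>

// Stand-in for llvm::Align: a value that is always a non-zero power of two.
struct Alignment {
  uint64_t Value;
  explicit Alignment(uint64_t V) : Value(V) {
    assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
  }
};

// Stand-in for llvm::MaybeAlign: the alignment may not be known yet.
using OptionalAlignment = std::optional<Alignment>;

// Nontemporal accesses always know their alignment, so take it by value...
static bool isLegalNTStoreSketch(unsigned StoreBytes, Alignment A) {
  // ...and the 16/32-byte vector cases can compare against it directly.
  return (StoreBytes == 16 || StoreBytes == 32) && A.Value >= StoreBytes;
}

// Masked accesses may be queried before an alignment has been decided.
static bool isLegalMaskedLoadSketch(OptionalAlignment A) {
  uint64_t Known = A ? A->Value : 1; // "value or one" when unspecified
  (void)Known; // the real X86 rule depends on the element type, not this value
  return true;
}

int main() {
  std::printf("%d\n", isLegalNTStoreSketch(32, Alignment(32))); // 1
  std::printf("%d\n", isLegalMaskedLoadSketch(std::nullopt));   // 1
  return 0;
}
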
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index a07d2f20acab..9280d030b5d5 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -292,8 +292,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
// need to insert any VZEROUPPER instructions. This is constant-time, so it
// is cheap in the common case of no ymm/zmm use.
bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
- const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass};
- for (auto *RC : RCs) {
+ for (auto *RC : {&X86::VR256RegClass, &X86::VR512_0_15RegClass}) {
if (!YmmOrZmmUsed) {
for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
i++) {
@@ -304,9 +303,8 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
}
}
}
- if (!YmmOrZmmUsed) {
+ if (!YmmOrZmmUsed)
return false;
- }
assert(BlockStates.empty() && DirtySuccessors.empty() &&
"X86VZeroUpper state should be clear");
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index 9e499db1d7ee..ae72c6427588 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -81,7 +81,7 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
MI->getOpcode() == X86::WIN_ALLOCA_64);
assert(MI->getOperand(0).isReg());
- unsigned AmountReg = MI->getOperand(0).getReg();
+ Register AmountReg = MI->getOperand(0).getReg();
MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg);
if (!Def ||
@@ -261,7 +261,7 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
break;
}
- unsigned AmountReg = MI->getOperand(0).getReg();
+ Register AmountReg = MI->getOperand(0).getReg();
MI->eraseFromParent();
// Delete the definition of AmountReg.
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index f68d17d7256d..d65e1f3ab414 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -339,7 +339,10 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
if (UseStackGuard) {
Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
Value *FrameAddr = Builder.CreateCall(
- Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress),
+ Intrinsic::getDeclaration(
+ TheModule, Intrinsic::frameaddress,
+ Builder.getInt8PtrTy(
+ TheModule->getDataLayout().getAllocaAddrSpace())),
Builder.getInt32(0), "frameaddr");
Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty);
FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val);
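
The WinEHState change reflects llvm.frameaddress becoming overloaded on its returned pointer type, so getDeclaration now needs an explicit pointer type in the module's alloca address space. A small sketch of just that call, assuming the LLVM headers of this revision; the helper name getFrameAddressDecl is made up for the example.

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Request the overloaded llvm.frameaddress declaration with the i8* type in
// the data layout's alloca address space, as the hunk above does.
static Function *getFrameAddressDecl(Module &M) {
  unsigned AllocaAS = M.getDataLayout().getAllocaAddrSpace();
  Type *PtrTy = Type::getInt8PtrTy(M.getContext(), AllocaAS);
  return Intrinsic::getDeclaration(&M, Intrinsic::frameaddress, {PtrTy});
}

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  // With the default (empty) data layout the alloca address space is 0, so
  // this prints the p0i8 variant of the intrinsic declaration.
  getFrameAddressDecl(M)->print(outs());
  return 0;
}

In the pass itself the declaration is handed straight to Builder.CreateCall with frame level 0, exactly as in the hunk above.
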
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 9f615b9e7741..6b3dc27cb886 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -115,7 +115,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
MCSymbol *GVSym = getSymbol(GV);
const Constant *C = GV->getInitializer();
- unsigned Align = (unsigned)DL.getPreferredTypeAlignmentShift(C->getType());
+ const Align Alignment(DL.getPrefTypeAlignment(C->getType()));
// Mark the start of the global
getTargetStreamer().emitCCTopData(GVSym->getName());
@@ -143,7 +143,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
llvm_unreachable("Unknown linkage type!");
}
- EmitAlignment(Align > 2 ? Align : 2, GV);
+ EmitAlignment(std::max(Alignment, Align(4)), GV);
if (GV->isThreadLocal()) {
report_fatal_error("TLS is not supported by this target!");
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 5066407c74aa..fd8b37e26e47 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -211,7 +211,7 @@ static void RestoreSpillList(MachineBasicBlock &MBB,
//===----------------------------------------------------------------------===//
XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) {
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(4), 0) {
// Do nothing
}
@@ -367,8 +367,8 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
// Return to the landing pad.
- unsigned EhStackReg = MBBI->getOperand(0).getReg();
- unsigned EhHandlerReg = MBBI->getOperand(1).getReg();
+ Register EhStackReg = MBBI->getOperand(0).getReg();
+ Register EhHandlerReg = MBBI->getOperand(1).getReg();
BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(EhStackReg);
BuildMI(MBB, MBBI, dl, TII.get(XCore::BAU_1r)).addReg(EhHandlerReg);
MBB.erase(MBBI); // Erase the previous return instruction.
diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
index e433d21c59b7..b5dbdea98eea 100644
--- a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
+++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -55,7 +55,7 @@ bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) {
MBBI != EE; ++MBBI) {
if (MBBI->getOpcode() == XCore::FRAME_TO_ARGS_OFFSET) {
MachineInstr &OldInst = *MBBI;
- unsigned Reg = OldInst.getOperand(0).getReg();
+ Register Reg = OldInst.getOperand(0).getReg();
MBBI = TII.loadImmediate(MBB, MBBI, Reg, StackSize);
OldInst.eraseFromParent();
}
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 072278d9fc46..bf006fd673f1 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -171,8 +171,8 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
- setMinFunctionAlignment(1);
- setPrefFunctionAlignment(2);
+ setMinFunctionAlignment(Align(2));
+ setPrefFunctionAlignment(Align(4));
}
bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
@@ -414,8 +414,8 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
"Unexpected extension type");
assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
- if (allowsMemoryAccess(Context, DAG.getDataLayout(), LD->getMemoryVT(),
- *LD->getMemOperand()))
+ if (allowsMemoryAccessForAlignment(Context, DAG.getDataLayout(),
+ LD->getMemoryVT(), *LD->getMemOperand()))
return SDValue();
SDValue Chain = LD->getChain();
@@ -488,8 +488,8 @@ SDValue XCoreTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
assert(!ST->isTruncatingStore() && "Unexpected store type");
assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
- if (allowsMemoryAccess(Context, DAG.getDataLayout(), ST->getMemoryVT(),
- *ST->getMemOperand()))
+ if (allowsMemoryAccessForAlignment(Context, DAG.getDataLayout(),
+ ST->getMemoryVT(), *ST->getMemOperand()))
return SDValue();
SDValue Chain = ST->getChain();
@@ -1309,7 +1309,7 @@ SDValue XCoreTargetLowering::LowerCCCArguments(
llvm_unreachable(nullptr);
}
case MVT::i32:
- unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+ Register VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
ArgIn = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
CFRegNode.push_back(ArgIn.getValue(ArgIn->getNumValues() - 1));
@@ -1360,7 +1360,7 @@ SDValue XCoreTargetLowering::LowerCCCArguments(
offset -= StackSlotSize;
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
// Move argument from phys reg -> virt reg
- unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+ Register VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
RegInfo.addLiveIn(ArgRegs[i], VReg);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
CFRegNode.push_back(Val.getValue(Val->getNumValues() - 1));
@@ -1780,8 +1780,9 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
// Replace unaligned store of unaligned load with memmove.
StoreSDNode *ST = cast<StoreSDNode>(N);
if (!DCI.isBeforeLegalize() ||
- allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
- ST->getMemoryVT(), *ST->getMemOperand()) ||
+ allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ ST->getMemoryVT(),
+ *ST->getMemOperand()) ||
ST->isVolatile() || ST->isIndexed()) {
break;
}
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 3752274e2cdf..86ec7f82d4d1 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -301,7 +301,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
<< "<--------->\n");
Offset/=4;
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand");
if (TFI->hasFP(MF)) {
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 2a8cd6b657b7..b5b7445265b7 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -53,7 +53,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32",
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
getEffectiveXCoreCodeModel(CM), OL),
- TLOF(llvm::make_unique<XCoreTargetObjectFile>()),
+ TLOF(std::make_unique<XCoreTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.h b/lib/Target/XCore/XCoreTargetTransformInfo.h
index 3fecaaa59722..58df1f290ec9 100644
--- a/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -40,7 +40,8 @@ public:
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
TLI(ST->getTargetLowering()) {}
- unsigned getNumberOfRegisters(bool Vector) {
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector) {
return 0;
}