Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.td | 11
-rw-r--r--  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 239
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrAtomics.td | 38
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrFormats.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 73
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedA53.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedA55.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedA57.td | 7
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedA64FX.td | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedAmpere1.td | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedCyclone.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedExynosM3.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedExynosM4.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedExynosM5.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedFalkor.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedKryo.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td | 2279
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedTSV110.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedThunderX.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td | 3
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 47
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 6
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp | 29
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h | 4
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 57
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 35
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 14
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/AArch64/SMEInstrFormats.td | 34
-rw-r--r--  llvm/lib/Target/AArch64/SVEInstrFormats.td | 10
-rw-r--r--  llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 53
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 47
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 76
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 77
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 175
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 127
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 212
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNVOPDUtils.h | 32
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/R600MCInstLower.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 125
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 524
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 34
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 6
-rw-r--r--  llvm/lib/Target/ARC/ARCAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/ARM/ARM.td | 11
-rw-r--r--  llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 4
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 14
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 6
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 2
-rw-r--r--  llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/AVR/AVRAsmPrinter.cpp | 4
-rw-r--r--  llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp | 79
-rw-r--r--  llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/BPF/BTF.h | 2
-rw-r--r--  llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp | 15
-rw-r--r--  llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/CSKY/CSKYInstrInfo.td | 2
-rw-r--r--  llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td | 2
-rw-r--r--  llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/DirectX/DXIL.td | 8
-rw-r--r--  llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 4
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp | 3
-rw-r--r--  llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h | 5
-rw-r--r--  llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp | 4
-rw-r--r--  llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h | 4
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td | 52
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td | 54
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp | 172
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchFrameLowering.h | 18
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp | 56
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h | 5
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 569
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.h | 26
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp | 70
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchInstrInfo.h | 10
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchInstrInfo.td | 218
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp | 33
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp | 24
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp | 7
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/M68k/M68kAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 4
-rw-r--r--  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTX.h | 1
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 4
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp | 2
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 2
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 54
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 10
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h | 6
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 48
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 9
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 20
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 10
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 17
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFrameLowering.h | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 646
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 13
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 105
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h | 6
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.td | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 85
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td | 36
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 16
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 28
-rw-r--r--  llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp | 10
-rw-r--r--  llvm/lib/Target/RISCV/RISCVScheduleV.td | 28
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 4
-rw-r--r--  llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp | 10
-rw-r--r--  llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp | 95
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h | 174
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 52
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 41
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 34
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 101
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h | 6
-rw-r--r--  llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 15
-rw-r--r--  llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 17
-rw-r--r--  llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 3
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZCallingConv.td | 6
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp | 113
-rw-r--r--  llvm/lib/Target/TargetLoweringObjectFile.cpp | 7
-rw-r--r--  llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp | 15
-rw-r--r--  llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/VE/VEAsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/Target/VE/VEInstrInfo.td | 8
-rw-r--r--  llvm/lib/Target/VE/VERegisterInfo.cpp | 203
-rw-r--r--  llvm/lib/Target/VE/VVPISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp | 7
-rw-r--r--  llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp | 33
-rw-r--r--  llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h | 8
-rw-r--r--  llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp | 22
-rw-r--r--  llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h | 4
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp | 22
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp | 18
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp | 25
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp | 27
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp | 17
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp | 8
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/X86/X86.h | 4
-rw-r--r--  llvm/lib/Target/X86/X86.td | 3
-rw-r--r--  llvm/lib/Target/X86/X86EvexToVex.cpp | 1
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 193
-rw-r--r--  llvm/lib/Target/X86/X86InstrCompiler.td | 38
-rw-r--r--  llvm/lib/Target/X86/X86InstrFMA3Info.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86InstrFoldTables.cpp | 8
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.td | 1
-rw-r--r--  llvm/lib/Target/X86/X86InstrSystem.td | 9
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h | 3
-rw-r--r--  llvm/lib/Target/X86/X86MCInstLower.cpp | 4
-rw-r--r--  llvm/lib/Target/X86/X86PartialReduction.cpp | 6
-rw-r--r--  llvm/lib/Target/X86/X86ReturnThunks.cpp | 92
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/XCore/XCoreAsmPrinter.cpp | 3
246 files changed, 7178 insertions, 1735 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index f092c039b58e..b332e9dcb176 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -650,6 +650,7 @@ include "AArch64SchedA64FX.td"
include "AArch64SchedThunderX3T110.td"
include "AArch64SchedTSV110.td"
include "AArch64SchedAmpere1.td"
+include "AArch64SchedNeoverseN2.td"
def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors">;
@@ -1137,7 +1138,7 @@ def : ProcessorModel<"cortex-a78", CortexA57Model, ProcessorFeatures.A78,
[TuneA78]>;
def : ProcessorModel<"cortex-a78c", CortexA57Model, ProcessorFeatures.A78C,
[TuneA78C]>;
-def : ProcessorModel<"cortex-a710", CortexA57Model, ProcessorFeatures.A710,
+def : ProcessorModel<"cortex-a710", NeoverseN2Model, ProcessorFeatures.A710,
[TuneA710]>;
def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82,
[TuneR82]>;
@@ -1145,17 +1146,17 @@ def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1,
[TuneX1]>;
def : ProcessorModel<"cortex-x1c", CortexA57Model, ProcessorFeatures.X1C,
[TuneX1]>;
-def : ProcessorModel<"cortex-x2", CortexA57Model, ProcessorFeatures.X2,
+def : ProcessorModel<"cortex-x2", NeoverseN2Model, ProcessorFeatures.X2,
[TuneX2]>;
def : ProcessorModel<"neoverse-e1", CortexA53Model,
ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>;
def : ProcessorModel<"neoverse-n1", CortexA57Model,
ProcessorFeatures.NeoverseN1, [TuneNeoverseN1]>;
-def : ProcessorModel<"neoverse-n2", CortexA57Model,
+def : ProcessorModel<"neoverse-n2", NeoverseN2Model,
ProcessorFeatures.NeoverseN2, [TuneNeoverseN2]>;
-def : ProcessorModel<"neoverse-512tvb", CortexA57Model,
+def : ProcessorModel<"neoverse-512tvb", NeoverseN2Model,
ProcessorFeatures.Neoverse512TVB, [TuneNeoverse512TVB]>;
-def : ProcessorModel<"neoverse-v1", CortexA57Model,
+def : ProcessorModel<"neoverse-v1", NeoverseN2Model,
ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>;
def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3,
[TuneExynosM3]>;
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index ef4860979dd3..c568f73471e1 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1173,6 +1173,8 @@ void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
#include "AArch64GenMCPseudoLowering.inc"
void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
+ AArch64_MC::verifyInstructionPredicates(MI->getOpcode(), STI->getFeatureBits());
+
// Do any auto-generated pseudo lowerings.
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index abfe2d507111..447ad10ddf22 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -237,6 +237,39 @@ static bool isMergePassthruOpcode(unsigned Opc) {
}
}
+// Returns true if inactive lanes are known to be zeroed by construction.
+static bool isZeroingInactiveLanes(SDValue Op) {
+ switch (Op.getOpcode()) {
+ default:
+ // We guarantee i1 splat_vectors to zero the other lanes by
+ // implementing it with ptrue and possibly a punpklo for nxv1i1.
+ if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
+ return true;
+ return false;
+ case AArch64ISD::PTRUE:
+ case AArch64ISD::SETCC_MERGE_ZERO:
+ return true;
+ case ISD::INTRINSIC_WO_CHAIN:
+ switch (Op.getConstantOperandVal(0)) {
+ default:
+ return false;
+ case Intrinsic::aarch64_sve_ptrue:
+ case Intrinsic::aarch64_sve_pnext:
+ case Intrinsic::aarch64_sve_cmpeq_wide:
+ case Intrinsic::aarch64_sve_cmpne_wide:
+ case Intrinsic::aarch64_sve_cmpge_wide:
+ case Intrinsic::aarch64_sve_cmpgt_wide:
+ case Intrinsic::aarch64_sve_cmplt_wide:
+ case Intrinsic::aarch64_sve_cmple_wide:
+ case Intrinsic::aarch64_sve_cmphs_wide:
+ case Intrinsic::aarch64_sve_cmphi_wide:
+ case Intrinsic::aarch64_sve_cmplo_wide:
+ case Intrinsic::aarch64_sve_cmpls_wide:
+ return true;
+ }
+ }
+}
+
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -1082,6 +1115,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
}
+ // FIXME: Move lowering for more nodes here if those are common between
+ // SVE and SME.
+ if (Subtarget->hasSVE() || Subtarget->hasSME()) {
+ for (auto VT :
+ {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+ }
+
if (Subtarget->hasSVE()) {
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
setOperationAction(ISD::BITREVERSE, VT, Custom);
@@ -1162,14 +1205,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
@@ -2429,6 +2470,23 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitAddVectorToTile(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm());
+ MIB.add(MI.getOperand(1)); // pn
+ MIB.add(MI.getOperand(2)); // pm
+ MIB.add(MI.getOperand(3)); // zn
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
@@ -2561,6 +2619,14 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
BB);
case AArch64::ZERO_M_PSEUDO:
return EmitZero(MI, BB);
+ case AArch64::ADDHA_MPPZ_PSEUDO_S:
+ return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::ADDVA_MPPZ_PSEUDO_S:
+ return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_S, AArch64::ZAS0, MI, BB);
+ case AArch64::ADDHA_MPPZ_PSEUDO_D:
+ return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_D, AArch64::ZAD0, MI, BB);
+ case AArch64::ADDVA_MPPZ_PSEUDO_D:
+ return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_D, AArch64::ZAD0, MI, BB);
}
}
@@ -4329,55 +4395,49 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
int Pattern) {
+ if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
+ return DAG.getConstant(1, DL, MVT::nxv1i1);
return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
DAG.getTargetConstant(Pattern, DL, MVT::i32));
}
-static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) {
+// Returns a safe bitcast between two scalable vector predicates, where
+// any newly created lanes from a widening bitcast are defined as zero.
+static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
- EVT OutVT = Op.getValueType();
- SDValue InOp = Op.getOperand(1);
- EVT InVT = InOp.getValueType();
+ EVT InVT = Op.getValueType();
+
+ assert(InVT.getVectorElementType() == MVT::i1 &&
+ VT.getVectorElementType() == MVT::i1 &&
+ "Expected a predicate-to-predicate bitcast");
+ assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ InVT.isScalableVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
+ "Only expect to cast between legal scalable predicate types!");
// Return the operand if the cast isn't changing type,
- // i.e. <n x 16 x i1> -> <n x 16 x i1>
- if (InVT == OutVT)
- return InOp;
+ // e.g. <n x 16 x i1> -> <n x 16 x i1>
+ if (InVT == VT)
+ return Op;
- SDValue Reinterpret =
- DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, InOp);
+ SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
- // If the argument converted to an svbool is a ptrue or a comparison, the
- // lanes introduced by the widening are zero by construction.
- switch (InOp.getOpcode()) {
- case AArch64ISD::SETCC_MERGE_ZERO:
+ // We only have to zero the lanes if new lanes are being defined, e.g. when
+ // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
+ // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
+ // we can return here.
+ if (InVT.bitsGT(VT))
return Reinterpret;
- case ISD::INTRINSIC_WO_CHAIN:
- switch (InOp.getConstantOperandVal(0)) {
- case Intrinsic::aarch64_sve_ptrue:
- case Intrinsic::aarch64_sve_cmpeq_wide:
- case Intrinsic::aarch64_sve_cmpne_wide:
- case Intrinsic::aarch64_sve_cmpge_wide:
- case Intrinsic::aarch64_sve_cmpgt_wide:
- case Intrinsic::aarch64_sve_cmplt_wide:
- case Intrinsic::aarch64_sve_cmple_wide:
- case Intrinsic::aarch64_sve_cmphs_wide:
- case Intrinsic::aarch64_sve_cmphi_wide:
- case Intrinsic::aarch64_sve_cmplo_wide:
- case Intrinsic::aarch64_sve_cmpls_wide:
- return Reinterpret;
- }
- }
- // Splat vectors of one will generate ptrue instructions
- if (ISD::isConstantSplatVectorAllOnes(InOp.getNode()))
+ // Check if the other lanes are already known to be zeroed by
+ // construction.
+ if (isZeroingInactiveLanes(Op))
return Reinterpret;
- // Otherwise, zero the newly introduced lanes.
- SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all);
- SDValue MaskReinterpret =
- DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, Mask);
- return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret);
+ // Zero the newly introduced lanes.
+ SDValue Mask = DAG.getConstant(1, DL, InVT);
+ Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
+ return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
}
SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
@@ -4546,10 +4606,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_dupq_lane:
return LowerDUPQLane(Op, DAG);
case Intrinsic::aarch64_sve_convert_from_svbool:
- return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
- Op.getOperand(1));
+ return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
case Intrinsic::aarch64_sve_convert_to_svbool:
- return lowerConvertToSVBool(Op, DAG);
+ return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
case Intrinsic::aarch64_sve_fneg:
return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
@@ -6393,9 +6452,8 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
if (SizeInBits < 8)
return false;
- APInt LowBits(SizeInBits, 0xFF);
APInt RequredZero(SizeInBits, 0xFE);
- KnownBits Bits = DAG.computeKnownBits(Arg, LowBits, 4);
+ KnownBits Bits = DAG.computeKnownBits(Arg, 4);
bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero;
return ZExtBool;
}
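
The hunk above simplifies checkZExtBool() to query DAG.computeKnownBits(Arg, 4) directly instead of also constructing a LowBits mask. The bit test it keeps can be read in isolation as the sketch below, built on the real llvm::KnownBits and llvm::APInt types; looksLikeZExtBool is an illustrative name and not part of the patch or of LLVM.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// Minimal sketch of the retained check: an argument can be treated as a
// zero-extended bool when bits 1..7 are known to be zero (bit 0 may be 0 or 1).
static bool looksLikeZExtBool(const KnownBits &Bits) {
  APInt RequiredZero(Bits.getBitWidth(), 0xFE); // same mask as RequredZero above
  return (Bits.Zero & RequiredZero) == RequiredZero;
}

int main() {
  KnownBits KB(32);
  KB.Zero = APInt::getHighBitsSet(32, 31); // bits 1..31 known zero
  return looksLikeZExtBool(KB) ? 0 : 1;    // 0: value already looks zero-extended
}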
@@ -14814,16 +14872,6 @@ static SDValue performANDCombine(SDNode *N,
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
- // Although NEON has no EORV instruction, when only the least significant bit
- // is required the operation is synonymous with ADDV.
- if (LHS.getOpcode() == ISD::VECREDUCE_XOR && isOneConstant(RHS) &&
- LHS.getOperand(0).getValueType().isFixedLengthVector() &&
- LHS.hasOneUse()) {
- SDLoc DL(N);
- SDValue ADDV = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, LHS.getOperand(0));
- return DAG.getNode(ISD::AND, DL, VT, ADDV, RHS);
- }
-
if (VT.isScalableVector())
return performSVEAndCombine(N, DCI);
@@ -16126,12 +16174,24 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
assert(Op.getValueType().isScalableVector() &&
TLI.isTypeLegal(Op.getValueType()) &&
"Expected legal scalable vector type!");
+ assert(Op.getValueType() == Pg.getValueType() &&
+ "Expected same type for PTEST operands");
// Ensure target specific opcodes are using legal type.
EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDValue TVal = DAG.getConstant(1, DL, OutVT);
SDValue FVal = DAG.getConstant(0, DL, OutVT);
+ // Ensure operands have type nxv16i1.
+ if (Op.getValueType() != MVT::nxv16i1) {
+ if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
+ isZeroingInactiveLanes(Op))
+ Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
+ else
+ Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
+ Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
+ }
+
// Set condition code (CC) flags.
SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
@@ -18026,6 +18086,54 @@ static SDValue performCSELCombine(SDNode *N,
return performCONDCombine(N, DCI, DAG, 2, 3);
}
+// Try to re-use an already extended operand of a vector SetCC feeding a
+// extended select. Doing so avoids requiring another full extension of the
+// SET_CC result when lowering the select.
+static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
+ EVT Op0MVT = Op->getOperand(0).getValueType();
+ if (!Op0MVT.isVector() || Op->use_empty())
+ return SDValue();
+
+ // Make sure that all uses of Op are VSELECTs with result matching types where
+ // the result type has a larger element type than the SetCC operand.
+ SDNode *FirstUse = *Op->use_begin();
+ if (FirstUse->getOpcode() != ISD::VSELECT)
+ return SDValue();
+ EVT UseMVT = FirstUse->getValueType(0);
+ if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
+ return SDValue();
+ if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
+ return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
+ }))
+ return SDValue();
+
+ APInt V;
+ if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue Op0ExtV;
+ SDValue Op1ExtV;
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
+ // Check if the first operand of the SET_CC is already extended. If it is,
+ // split the SET_CC and re-use the extended version of the operand.
+ SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
+ Op->getOperand(0));
+ SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
+ Op->getOperand(0));
+ if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
+ Op0ExtV = SDValue(Op0SExt, 0);
+ Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
+ } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
+ Op0ExtV = SDValue(Op0ZExt, 0);
+ Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
+ } else
+ return SDValue();
+
+ return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
+ Op0ExtV, Op1ExtV, Op->getOperand(2));
+}
+
static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
SDValue LHS = N->getOperand(0);
@@ -18034,6 +18142,9 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
+ if (SDValue V = tryToWidenSetCCOperands(N, DAG))
+ return V;
+
// setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
if (Cond == ISD::SETNE && isOneConstant(RHS) &&
LHS->getOpcode() == AArch64ISD::CSEL &&
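
For context on tryToWidenSetCCOperands() above, the C++ below is one shape of source code that, after vectorization, yields the targeted DAG: an i8 compare feeding vselects whose result type is wider, with a sign-extended copy of the compare operand already present. It is only an illustration of the pattern, not a test case taken from the patch.

#include <cstddef>
#include <cstdint>

// After vectorization, 'a[i] > 42' becomes a setcc on i8 elements and the
// ternary becomes a vselect producing i32 elements; 'wide' is the pre-existing
// sign_extend the combine can reuse by comparing the widened operands instead.
void select_wider(int32_t *dst, const int8_t *a, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    int32_t wide = a[i];
    dst[i] = (a[i] > 42) ? wide : 0;
  }
}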
@@ -21045,7 +21156,7 @@ SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
default:
return SDValue();
case ISD::VECREDUCE_OR:
- if (isAllActivePredicate(DAG, Pg))
+ if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
// The predicate can be 'Op' because
// vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
@@ -21058,6 +21169,11 @@ SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
case ISD::VECREDUCE_XOR: {
SDValue ID =
DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
+ if (OpVT == MVT::nxv1i1) {
+ // Emulate a CNTP on .Q using .D and a different governing predicate.
+ Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
+ Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
+ }
SDValue Cntp =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
@@ -21464,22 +21580,17 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT InVT = Op.getValueType();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- (void)TLI;
- assert(VT.isScalableVector() && TLI.isTypeLegal(VT) &&
- InVT.isScalableVector() && TLI.isTypeLegal(InVT) &&
+ assert(VT.isScalableVector() && isTypeLegal(VT) &&
+ InVT.isScalableVector() && isTypeLegal(InVT) &&
"Only expect to cast between legal scalable vector types!");
- assert((VT.getVectorElementType() == MVT::i1) ==
- (InVT.getVectorElementType() == MVT::i1) &&
- "Cannot cast between data and predicate scalable vector types!");
+ assert(VT.getVectorElementType() != MVT::i1 &&
+ InVT.getVectorElementType() != MVT::i1 &&
+ "For predicate bitcasts, use getSVEPredicateBitCast");
if (InVT == VT)
return Op;
- if (VT.getVectorElementType() == MVT::i1)
- return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
-
EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
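
The predicate-cast changes above (isZeroingInactiveLanes() plus getSVEPredicateBitCast()) reduce to one rule: a widening predicate cast needs an explicit AND with a reinterpreted all-true mask of the input type only when the producer is not already known to zero its inactive lanes. A standalone model of that decision is sketched below; the enum and helper names are illustrative and are not LLVM API.

#include <cstdio>

// Producers recognised by isZeroingInactiveLanes() in the hunk above.
enum class PredSrc { PTrue, SetCCMergeZero, SplatAllOnes, Other };

static bool zeroesInactiveLanes(PredSrc Src) {
  switch (Src) {
  case PredSrc::PTrue:
  case PredSrc::SetCCMergeZero:
  case PredSrc::SplatAllOnes:
    return true;
  default:
    return false;
  }
}

// True when casting a predicate with InLanes elements to one with OutLanes
// elements must AND the result with a reinterpreted all-true mask of the
// narrower (input) type, mirroring the tail of getSVEPredicateBitCast().
static bool needsZeroingMask(unsigned InLanes, unsigned OutLanes, PredSrc Src) {
  if (OutLanes <= InLanes) // narrowing or same-width cast defines no new lanes
    return false;
  return !zeroesInactiveLanes(Src);
}

int main() {
  std::printf("%d\n", needsZeroingMask(4, 16, PredSrc::Other));          // 1
  std::printf("%d\n", needsZeroingMask(4, 16, PredSrc::SetCCMergeZero)); // 0
  std::printf("%d\n", needsZeroingMask(16, 2, PredSrc::Other));          // 0
}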
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 06ea918ea32e..e02b5e56fd2e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -571,6 +571,9 @@ public:
MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitAddVectorToTile(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
@@ -1148,6 +1151,7 @@ private:
// These can make "bitcasting" a multiphase process. REINTERPRET_CAST is used
// to transition between unpacked and packed types of the same element type,
// with BITCAST used otherwise.
+ // This function does not handle predicate bitcasts.
SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const;
bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index c477a44b13b2..6839e73796a6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -29,21 +29,21 @@ def : Pat<(atomic_fence (timm), (timm)), (DMB (i32 0xb))>;
// An atomic load operation that does not need either acquire or release
// semantics.
-class relaxed_load<PatFrag base>
+class relaxed_load<PatFrags base>
: PatFrag<(ops node:$ptr), (base node:$ptr)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquireOrStronger = 0;
}
// A atomic load operation that actually needs acquire semantics.
-class acquiring_load<PatFrag base>
+class acquiring_load<PatFrags base>
: PatFrag<(ops node:$ptr), (base node:$ptr)> {
let IsAtomic = 1;
let IsAtomicOrderingAcquire = 1;
}
// An atomic load operation that needs sequential consistency.
-class seq_cst_load<PatFrag base>
+class seq_cst_load<PatFrags base>
: PatFrag<(ops node:$ptr), (base node:$ptr)> {
let IsAtomic = 1;
let IsAtomicOrderingSequentiallyConsistent = 1;
@@ -63,34 +63,34 @@ let Predicates = [HasLDAPR] in {
}
// 8-bit loads
-def : Pat<(seq_cst_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
-def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
-def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+def : Pat<(seq_cst_load<atomic_load_az_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
+def : Pat<(acquiring_load<atomic_load_az_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_az_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend8:$offset)),
(LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>;
-def : Pat<(relaxed_load<atomic_load_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
- ro_Xextend8:$offset)),
+def : Pat<(relaxed_load<atomic_load_az_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend8:$offset)),
(LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>;
-def : Pat<(relaxed_load<atomic_load_8> (am_indexed8 GPR64sp:$Rn,
- uimm12s1:$offset)),
+def : Pat<(relaxed_load<atomic_load_az_8> (am_indexed8 GPR64sp:$Rn,
+ uimm12s1:$offset)),
(LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
-def : Pat<(relaxed_load<atomic_load_8>
+def : Pat<(relaxed_load<atomic_load_az_8>
(am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
// 16-bit loads
-def : Pat<(seq_cst_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
-def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
-def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+def : Pat<(seq_cst_load<atomic_load_az_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
+def : Pat<(acquiring_load<atomic_load_az_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_az_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend16:$extend)),
(LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
-def : Pat<(relaxed_load<atomic_load_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
- ro_Xextend16:$extend)),
+def : Pat<(relaxed_load<atomic_load_az_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend)),
(LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
-def : Pat<(relaxed_load<atomic_load_16> (am_indexed16 GPR64sp:$Rn,
- uimm12s2:$offset)),
+def : Pat<(relaxed_load<atomic_load_az_16> (am_indexed16 GPR64sp:$Rn,
+ uimm12s2:$offset)),
(LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
-def : Pat<(relaxed_load<atomic_load_16>
+def : Pat<(relaxed_load<atomic_load_az_16>
(am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
(LDURHHi GPR64sp:$Rn, simm9:$offset)>;
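
The retargeted patterns above select anyext/zext-capable atomic loads (atomic_load_az_8 and atomic_load_az_16). For orientation, the C++ below shows the source-level loads these patterns typically cover; the instruction names in the comments are the expected AArch64 selections under this file's patterns, not guaranteed compiler output.

#include <atomic>
#include <cstdint>

// relaxed_load<atomic_load_az_8>: an ordinary zero-extending LDRB form suffices.
uint32_t load_relaxed_u8(const std::atomic<uint8_t> &A) {
  return A.load(std::memory_order_relaxed);
}

// acquiring_load<atomic_load_az_8>: expected to select LDARB.
uint32_t load_acquire_u8(const std::atomic<uint8_t> &A) {
  return A.load(std::memory_order_acquire);
}

// seq_cst_load<atomic_load_az_16>: expected to select LDARH.
uint32_t load_seq_cst_u16(const std::atomic<uint16_t> &A) {
  return A.load(std::memory_order_seq_cst);
}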
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 78bc1b8c6f02..02fa36a1df4b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1505,7 +1505,7 @@ class CRmSystemI<Operand crmtype, bits<3> opc, string asm,
class SystemNoOperands<bits<3> op2, string asm, list<dag> pattern = []>
: SimpleSystemI<0, (ins), asm, "", pattern>,
- Sched<[]> {
+ Sched<[WriteHint]> {
bits<4> CRm;
let CRm = 0b0011;
let Inst{31-12} = 0b11010101000000110010;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 3802a45ad6c1..d444223e4494 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4356,10 +4356,12 @@ defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;
// AArch64's FCVT instructions saturate when out of range.
multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> {
+ let Predicates = [HasFullFP16] in {
def : Pat<(v4i16 (to_int_sat v4f16:$Rn, i16)),
(!cast<Instruction>(INST # v4f16) v4f16:$Rn)>;
def : Pat<(v8i16 (to_int_sat v8f16:$Rn, i16)),
(!cast<Instruction>(INST # v8f16) v8f16:$Rn)>;
+ }
def : Pat<(v2i32 (to_int_sat v2f32:$Rn, i32)),
(!cast<Instruction>(INST # v2f32) v2f32:$Rn)>;
def : Pat<(v4i32 (to_int_sat v4f32:$Rn, i32)),
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 68ff1b78e84b..c66f9cfd9c22 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -778,7 +778,7 @@ let Predicates = [HasSVEorSME] in {
defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>;
defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>;
- def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
+ def PTEST_PP : sve_int_ptest<0b010000, "ptest", AArch64ptest>;
defm PFALSE : sve_int_pfalse<0b000000, "pfalse">;
defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
@@ -1531,6 +1531,14 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
(PUNPKHI_PP PPR:$Ps)>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
+ (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 1))),
+ (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
+ (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 3))),
+ (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>;
def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
(PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>;
def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))),
@@ -1539,7 +1547,6 @@ let Predicates = [HasSVEorSME] in {
(PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>;
def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))),
(PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>;
-
def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
(PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>;
def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))),
@@ -1549,6 +1556,23 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))),
(PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>;
+
+ def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
+ (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 1))),
+ (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))),
+ (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 3))),
+ (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
+ (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 5))),
+ (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))),
+ (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 7))),
+ (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>;
def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
(PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>;
def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))),
@@ -1566,6 +1590,39 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))),
(PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+ (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 1))),
+ (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))),
+ (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 3))),
+ (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))),
+ (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 5))),
+ (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 6))),
+ (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 7))),
+ (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+ (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 9))),
+ (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 10))),
+ (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 11))),
+ (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))),
+ (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 13))),
+ (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))),
+ (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>;
+ def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 15))),
+ (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>;
+
// Extract subvectors from FP SVE vectors
def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))),
(UUNPKLO_ZZ_D ZPR:$Zs)>;
@@ -2074,15 +2131,6 @@ let Predicates = [HasSVEorSME] in {
def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
- def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)),
- (PTEST_PP PPR:$pg, PPR:$src)>;
- def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)),
- (PTEST_PP PPR:$pg, PPR:$src)>;
- def : Pat<(AArch64ptest (nxv4i1 PPR:$pg), (nxv4i1 PPR:$src)),
- (PTEST_PP PPR:$pg, PPR:$src)>;
- def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
- (PTEST_PP PPR:$pg, PPR:$src)>;
-
let AddedComplexity = 1 in {
class LD1RPat<ValueType vt, SDPatternOperator operator,
Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> :
@@ -2347,6 +2395,9 @@ let Predicates = [HasSVEorSME] in {
(AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>;
+ // Emulate .Q operation using a PTRUE_D when the other lanes don't matter.
+ def : Pat<(nxv1i1 (and PPR:$Ps1, PPR:$Ps2)),
+ (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>;
// Add more complex addressing modes here as required
multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
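
The nxv1i1 extract_subvector patterns added above follow a simple rule: reaching lane Idx of an <vscale x N x i1> predicate takes log2(N) PUNPKLO/PUNPKHI steps, with bit k of Idx choosing PUNPKHI (1) or PUNPKLO (0) for the k-th unpack counting from the outermost. The helper below reproduces those chains for the single-lane (nxv1i1) cases; it is an illustrative aid, not code from the patch.

#include <cstdio>
#include <string>

// Builds the PUNPK chain for (nxv1i1 (extract_subvector (nxv<SrcLanes>i1 Ps), Idx)).
// Unpacks are applied innermost-first, i.e. from the highest index bit down.
// (__builtin_ctz is a GCC/Clang builtin; SrcLanes must be a power of two.)
static std::string unpackChain(unsigned SrcLanes, unsigned Idx) {
  std::string Chain = "PPR:$Ps";
  for (int Bit = __builtin_ctz(SrcLanes) - 1; Bit >= 0; --Bit) {
    const char *Op = ((Idx >> Bit) & 1) ? "PUNPKHI_PP" : "PUNPKLO_PP";
    Chain = std::string(Op) + " (" + Chain + ")";
  }
  return Chain;
}

int main() {
  // Matches the (nxv16i1 ..., (i64 10)) pattern above:
  // PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP (PPR:$Ps))))
  std::printf("%s\n", unpackChain(16, 10).c_str());
  // Matches the (nxv8i1 ..., (i64 5)) pattern above.
  std::printf("%s\n", unpackChain(8, 5).c_str());
}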
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td
index d18a05fda191..e378b043d37e 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -28,7 +28,8 @@ def CortexA53Model : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td
index c6b112d0d2f1..141cc6b79c8b 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -29,7 +29,7 @@ def CortexA55Model : SchedMachineModel {
let PostRAScheduler = 1; // Enable PostRA scheduler pass.
let CompleteModel = 0; // Covers instructions applicable to Cortex-A55.
- list<Predicate> UnsupportedFeatures = [HasSVE];
+ list<Predicate> UnsupportedFeatures = [HasSVE, HasMTE];
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td
index a860aa907fd1..8ce229374000 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -33,7 +33,8 @@ def CortexA57Model : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
}
//===----------------------------------------------------------------------===//
@@ -459,9 +460,9 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCM
// ASIMD FP convert, long and narrow
def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>;
// ASIMD FP convert, other, D-form
-def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
// ASIMD FP convert, other, Q-form
-def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
// ASIMD FP divide, D-form, F32
def : InstRW<[A57Write_17cyc_1W], (instregex "FDIVv2f32")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
index 6b053f1969b4..4c65b6727d93 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
@@ -18,11 +18,11 @@ def A64FXModel : SchedMachineModel {
// Determined via a mix of micro-arch details and experimentation.
let LoopMicroOpBufferSize = 128;
let PostRAScheduler = 1; // Using PostRA sched.
- let CompleteModel = 1;
+ let CompleteModel = 0;
list<Predicate> UnsupportedFeatures =
[HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth,
- HasSVE2orSME];
+ HasSVE2orSME, HasMTE, HasMatMulInt8, HasBF16];
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
index 32f7299fbf87..b8d5a70d7ec6 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td
@@ -25,7 +25,9 @@ def Ampere1Model : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ PAUnsupported.F,
+ [HasMTE]);
}
let SchedModel = Ampere1Model in {
diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
index 9fbb46919427..e2d916954060 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -20,7 +20,8 @@ def CycloneModel : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
index d66efb82fccc..f2863f5a8e3b 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -26,7 +26,8 @@ def ExynosM3Model : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
index 94e70793e855..ab1e680f9e99 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
@@ -26,7 +26,8 @@ def ExynosM4Model : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
index 1db5f5322a64..ae0b2b3eaeb6 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
@@ -26,7 +26,8 @@ def ExynosM5Model : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
index 7c9b0afdd169..a765cd1cdfe3 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -25,7 +25,8 @@ def FalkorModel : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td
index cc568a2f2f17..3551066ee7c3 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td
@@ -29,7 +29,8 @@ def KryoModel : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
new file mode 100644
index 000000000000..eb5b971d66e5
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -0,0 +1,2279 @@
+//=- AArch64SchedNeoverseN2.td - NeoverseN2 Scheduling Defs --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for the Arm Neoverse N2 processors.
+//
+//===----------------------------------------------------------------------===//
+
+def NeoverseN2Model : SchedMachineModel {
+ let IssueWidth = 10; // Micro-ops dispatched at a time.
+ let MicroOpBufferSize = 160; // Entries in micro-op re-order buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 10; // Extra cycles for mispredicted branch.
+ let LoopMicroOpBufferSize = 16; // NOTE: Copied from Cortex-A57.
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = SMEUnsupported.F;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Neoverse N2.
+// Instructions are first fetched and then decoded into internal macro-ops
+// (MOPs). From there, the MOPs proceed through register renaming and dispatch
+// stages. A MOP can be split into two micro-ops further down the pipeline
+// after the decode stage. Once dispatched, micro-ops wait for their operands
+// and issue out-of-order to one of thirteen issue pipelines. Each issue
+// pipeline can accept one micro-op per cycle.
+
+let SchedModel = NeoverseN2Model in {
+
+// Define the (13) issue ports.
+def N2UnitB : ProcResource<2>; // Branch 0/1
+def N2UnitS : ProcResource<2>; // Integer single Cycle 0/1
+def N2UnitM0 : ProcResource<1>; // Integer multicycle 0
+def N2UnitM1 : ProcResource<1>; // Integer multicycle 1
+def N2UnitL01 : ProcResource<2>; // Load/Store 0/1
+def N2UnitL2 : ProcResource<1>; // Load 2
+def N2UnitD : ProcResource<2>; // Store data 0/1
+def N2UnitV0 : ProcResource<1>; // FP/ASIMD 0
+def N2UnitV1 : ProcResource<1>; // FP/ASIMD 1
+
+def N2UnitV : ProcResGroup<[N2UnitV0, N2UnitV1]>; // FP/ASIMD 0/1
+def N2UnitM : ProcResGroup<[N2UnitM0, N2UnitM1]>; // Integer single/multicycle 0/1
+def N2UnitL : ProcResGroup<[N2UnitL01, N2UnitL2]>; // Load/Store 0/1 and Load 2
+def N2UnitI : ProcResGroup<[N2UnitS, N2UnitM0, N2UnitM1]>; // Integer single cycle 0/1 and single/multicycle 0/1
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Neoverse N2.
+
+//===----------------------------------------------------------------------===//
+// Define generic 1 micro-op types
+
+def N2Write_1cyc_1B : SchedWriteRes<[N2UnitB]> { let Latency = 1; }
+def N2Write_1cyc_1I : SchedWriteRes<[N2UnitI]> { let Latency = 1; }
+def N2Write_1cyc_1M : SchedWriteRes<[N2UnitM]> { let Latency = 1; }
+def N2Write_1cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 1; }
+def N2Write_1cyc_1L01 : SchedWriteRes<[N2UnitL01]> { let Latency = 1; }
+def N2Write_2cyc_1M : SchedWriteRes<[N2UnitM]> { let Latency = 2; }
+def N2Write_3cyc_1M : SchedWriteRes<[N2UnitM]> { let Latency = 3; }
+def N2Write_2cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 2;
+ let ResourceCycles = [2]; }
+def N2Write_3cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 3;
+ let ResourceCycles = [3]; }
+def N2Write_5cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 5;
+ let ResourceCycles = [5]; }
+def N2Write_12cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 12;
+ let ResourceCycles = [12]; }
+def N2Write_20cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 20;
+ let ResourceCycles = [20]; }
+def N2Write_4cyc_1L : SchedWriteRes<[N2UnitL]> { let Latency = 4; }
+def N2Write_6cyc_1L : SchedWriteRes<[N2UnitL]> { let Latency = 6; }
+def N2Write_2cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 2; }
+def N2Write_3cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 3; }
+def N2Write_4cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 4; }
+def N2Write_5cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 5; }
+def N2Write_12cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 12; }
+def N2Write_2cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 2; }
+def N2Write_3cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 3; }
+def N2Write_4cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 4; }
+def N2Write_7cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 7;
+ let ResourceCycles = [7]; }
+def N2Write_9cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 9; }
+def N2Write_10cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 10; }
+def N2Write_12cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 12; }
+def N2Write_13cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 13; }
+def N2Write_15cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 15; }
+def N2Write_16cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 16; }
+def N2Write_20cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 20; }
+def N2Write_2cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 2; }
+def N2Write_3cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 3; }
+def N2Write_4cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 4; }
+def N2Write_6cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 6; }
+def N2Write_10cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 10; }
+def N2Write_6cyc_1L01 : SchedWriteRes<[N2UnitL01]> { let Latency = 6; }
+
+//===----------------------------------------------------------------------===//
+// Define generic 2 micro-op types
+
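+// Multi micro-op types list one issue port per micro-op; NumMicroOps is the
+// number of micro-ops dispatched and Latency the latency of the result.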
+def N2Write_1cyc_1B_1S : SchedWriteRes<[N2UnitB, N2UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def N2Write_6cyc_1M0_1B : SchedWriteRes<[N2UnitM0, N2UnitB]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def N2Write_9cyc_1M0_1L : SchedWriteRes<[N2UnitM0, N2UnitL]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+
+def N2Write_3cyc_1I_1M : SchedWriteRes<[N2UnitI, N2UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def N2Write_4cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def N2Write_5cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def N2Write_6cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def N2Write_7cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def N2Write_1cyc_1L01_1D : SchedWriteRes<[N2UnitL01, N2UnitD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def N2Write_5cyc_1M0_1V : SchedWriteRes<[N2UnitM0, N2UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def N2Write_2cyc_1L01_1V : SchedWriteRes<[N2UnitL01, N2UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def N2Write_4cyc_1V1_1V : SchedWriteRes<[N2UnitV1, N2UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def N2Write_4cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def N2Write_10cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [5, 5];
+}
+
+def N2Write_13cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> {
+ let Latency = 13;
+ let NumMicroOps = 2;
+ let ResourceCycles = [6, 7];
+}
+
+def N2Write_15cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> {
+ let Latency = 15;
+ let NumMicroOps = 2;
+ let ResourceCycles = [7, 8];
+}
+
+def N2Write_16cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> {
+ let Latency = 16;
+ let NumMicroOps = 2;
+ let ResourceCycles = [8, 8];
+}
+
+def N2Write_4cyc_2V : SchedWriteRes<[N2UnitV, N2UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def N2Write_6cyc_2V : SchedWriteRes<[N2UnitV, N2UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def N2Write_6cyc_2L : SchedWriteRes<[N2UnitL, N2UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def N2Write_8cyc_1L_1V : SchedWriteRes<[N2UnitL, N2UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def N2Write_4cyc_1L01_1V : SchedWriteRes<[N2UnitL01, N2UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def N2Write_3cyc_1M0_1M : SchedWriteRes<[N2UnitM0, N2UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def N2Write_2cyc_1M0_1M : SchedWriteRes<[N2UnitM0, N2UnitM]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def N2Write_6cyc_2V1 : SchedWriteRes<[N2UnitV1, N2UnitV1]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def N2Write_4cyc_1V0_1M : SchedWriteRes<[N2UnitV0, N2UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def N2Write_5cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def N2Write_5cyc_1V1_1M0 : SchedWriteRes<[N2UnitV1, N2UnitM0]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def N2Write_7cyc_1M0_1V0 : SchedWriteRes<[N2UnitM0, N2UnitV0]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+def N2Write_2cyc_1V0_1M : SchedWriteRes<[N2UnitV0, N2UnitM]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def N2Write_6cyc_1V_1V1 : SchedWriteRes<[N2UnitV, N2UnitV1]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def N2Write_6cyc_1L_1M : SchedWriteRes<[N2UnitL, N2UnitM]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def N2Write_6cyc_1L_1S : SchedWriteRes<[N2UnitL, N2UnitS]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def N2Write_9cyc_1L_1V : SchedWriteRes<[N2UnitL, N2UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+
+def N2Write_4cyc_2V1 : SchedWriteRes<[N2UnitV1, N2UnitV1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 3 micro-op types
+
+def N2Write_1cyc_1L01_1D_1I : SchedWriteRes<[N2UnitL01, N2UnitD, N2UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+def N2Write_2cyc_1L01_1V_1I : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def N2Write_2cyc_1L01_2V : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def N2Write_7cyc_1M_1M0_1V : SchedWriteRes<[N2UnitM, N2UnitM0, N2UnitV]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+def N2Write_8cyc_1M0_1V1_1V : SchedWriteRes<[N2UnitM0, N2UnitV1, N2UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+def N2Write_10cyc_1V_1L_1S : SchedWriteRes<[N2UnitV, N2UnitL, N2UnitS]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+def N2Write_2cyc_1L01_1S_1V : SchedWriteRes<[N2UnitL01, N2UnitS, N2UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+
+def N2Write_4cyc_1L01_1S_1V : SchedWriteRes<[N2UnitL01, N2UnitS, N2UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def N2Write_6cyc_3L : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def N2Write_8cyc_1L_2V : SchedWriteRes<[N2UnitL, N2UnitV, N2UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 4 micro-op types
+
+def N2Write_2cyc_1L01_2V_1I : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitV,
+ N2UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def N2Write_6cyc_4V0 : SchedWriteRes<[N2UnitV0, N2UnitV0, N2UnitV0, N2UnitV0]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def N2Write_4cyc_4V : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def N2Write_6cyc_4V : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def N2Write_8cyc_2L_2V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def N2Write_9cyc_2L_2V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+}
+
+def N2Write_2cyc_2L01_2V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitV,
+ N2UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def N2Write_4cyc_2L01_2V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitV,
+ N2UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def N2Write_5cyc_2L01_2V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitV,
+ N2UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+def N2Write_8cyc_2M0_2V0 : SchedWriteRes<[N2UnitM0, N2UnitM0, N2UnitV0,
+ N2UnitV0]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def N2Write_11cyc_2V_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1,
+ N2UnitV1]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+}
+
+def N2Write_9cyc_2V_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1,
+ N2UnitV1]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+}
+
+def N2Write_8cyc_2V_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1,
+ N2UnitV1]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+def N2Write_10cyc_2L_2V1 : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV1,
+ N2UnitV1]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+}
+
+def N2Write_10cyc_2L_2V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+}
+
+def N2Write_4cyc_2M0_2M : SchedWriteRes<[N2UnitM0, N2UnitM0, N2UnitM,
+ N2UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def N2Write_6cyc_2I_2L : SchedWriteRes<[N2UnitI, N2UnitI, N2UnitL, N2UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def N2Write_7cyc_4L : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 5 micro-op types
+
+def N2Write_2cyc_1L01_2V_2I : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitV,
+ N2UnitI, N2UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 5;
+}
+
+def N2Write_8cyc_2L_3V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV,
+ N2UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 6 micro-op types
+
+def N2Write_8cyc_3L_3V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL,
+ N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+}
+
+def N2Write_2cyc_3L01_3V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 6;
+}
+
+def N2Write_6cyc_3L01_3V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+}
+
+def N2Write_4cyc_3L01_3V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+def N2Write_10cyc_2L_2V_2S : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV,
+ N2UnitS, N2UnitS]> {
+ let Latency = 10;
+ let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 7 micro-op types
+
+def N2Write_8cyc_3L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL,
+ N2UnitV, N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 7;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 8 micro-op types
+
+def N2Write_6cyc_8V : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV, N2UnitV,
+ N2UnitV, N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 8;
+}
+
+def N2Write_2cyc_4L01_4V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitL01, N2UnitV, N2UnitV, N2UnitV,
+ N2UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 8;
+}
+
+def N2Write_5cyc_4L01_4V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitL01, N2UnitV, N2UnitV, N2UnitV,
+ N2UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 8;
+}
+
+def N2Write_8cyc_4L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL,
+ N2UnitV, N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+}
+
+def N2Write_9cyc_4L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL,
+ N2UnitV, N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 10 micro-op types
+
+def N2Write_7cyc_5L01_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitL01, N2UnitL01, N2UnitV,
+ N2UnitV, N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 7;
+ let NumMicroOps = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 12 micro-op types
+
+def N2Write_7cyc_6L01_6V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitV, N2UnitV, N2UnitV, N2UnitV,
+ N2UnitV, N2UnitV]> {
+ let Latency = 7;
+ let NumMicroOps = 12;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 15 micro-op types
+
+def N2Write_7cyc_5L01_5S_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitL01, N2UnitL01, N2UnitS,
+ N2UnitS, N2UnitS, N2UnitS,
+ N2UnitS, N2UnitV, N2UnitV,
+ N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 7;
+ let NumMicroOps = 15;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 18 micro-op types
+
+def N2Write_11cyc_9L01_9V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitV, N2UnitV, N2UnitV,
+ N2UnitV, N2UnitV, N2UnitV,
+ N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 11;
+ let NumMicroOps = 18;
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 27 micro-op types
+
+def N2Write_11cyc_9L01_9S_9V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitL01, N2UnitL01, N2UnitL01,
+ N2UnitS, N2UnitS, N2UnitS,
+ N2UnitS, N2UnitS, N2UnitS,
+ N2UnitS, N2UnitS, N2UnitS,
+ N2UnitV, N2UnitV, N2UnitV,
+ N2UnitV, N2UnitV, N2UnitV,
+ N2UnitV, N2UnitV, N2UnitV]> {
+ let Latency = 11;
+ let NumMicroOps = 27;
+}
+
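+// In the tables that follow, SchedAlias maps a generic AArch64 SchedWrite
+// class onto one of the N2 write types defined above, while InstRW overrides
+// the scheduling information directly for the listed instructions (matched by
+// name or by regular expression).
+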
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+
+// Branch, immed
+// Compare and branch
+def : SchedAlias<WriteBr, N2Write_1cyc_1B>;
+
+// Branch, register
+def : SchedAlias<WriteBrReg, N2Write_1cyc_1B>;
+
+// Branch and link, immed
+// Branch and link, register
+def : InstRW<[N2Write_1cyc_1B_1S], (instrs BL, BLR)>;
+
+// Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+
+// ALU, basic
+// ALU, basic, flagset
+def : SchedAlias<WriteI, N2Write_1cyc_1I>;
+
+// ALU, extend and shift
+def : SchedAlias<WriteISReg, N2Write_2cyc_1M>;
+def : SchedAlias<WriteIEReg, N2Write_2cyc_1M>;
+
+// Arithmetic, immediate to logical address tag
+def : InstRW<[N2Write_2cyc_1M], (instrs ADDG, SUBG)>;
+
+// Convert floating-point condition flags
+// Flag manipulation instructions
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+
+// Insert Random Tags
+def : InstRW<[N2Write_2cyc_1M], (instrs IRG, IRGstack)>;
+
+// Insert Tag Mask
+// Subtract Pointer
+// Subtract Pointer, flagset
+def : InstRW<[N2Write_1cyc_1I], (instrs GMI, SUBP, SUBPS)>;
+
+// Move and shift instructions
+// -----------------------------------------------------------------------------
+
+def : SchedAlias<WriteImm, N2Write_1cyc_1I>;
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+
+// SDIV, UDIV
+def : SchedAlias<WriteID32, N2Write_12cyc_1M0>;
+def : SchedAlias<WriteID64, N2Write_20cyc_1M0>;
+
+def : WriteRes<WriteIM32, [N2UnitM]> { let Latency = 2; }
+def : WriteRes<WriteIM64, [N2UnitM]> { let Latency = 2; }
+
+// Multiply high
+def : InstRW<[N2Write_3cyc_1M], (instrs SMULHrr, UMULHrr)>;
+
+// Pointer Authentication Instructions (v8.3 PAC)
+// -----------------------------------------------------------------------------
+
+// Authenticate data address
+// Authenticate instruction address
+// Compute pointer authentication code for data address
+// Compute pointer authentication code, using generic key
+// Compute pointer authentication code for instruction address
+def : InstRW<[N2Write_5cyc_1M0], (instregex "^AUT", "^PAC")>;
+
+// Branch and link, register, with pointer authentication
+// Branch, register, with pointer authentication
+// Branch, return, with pointer authentication
+def : InstRW<[N2Write_6cyc_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA,
+ BRAAZ, BRAB, BRABZ, RETAA, RETAB,
+ ERETAA, ERETAB)>;
+
+// Load register, with pointer authentication
+def : InstRW<[N2Write_9cyc_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>;
+
+// Strip pointer authentication code
+def : InstRW<[N2Write_2cyc_1M0], (instrs XPACD, XPACI, XPACLRI)>;
+
+// Miscellaneous data-processing instructions
+// -----------------------------------------------------------------------------
+
+// Bitfield extract, one reg
+// Bitfield extract, two regs
+// NOTE: We don't model the difference between EXTR where both operands are the
+// same register (one reg) and the general two-register form.
+def : SchedAlias<WriteExtr, N2Write_3cyc_1I_1M>;
+def : InstRW<[N2Write_3cyc_1I_1M], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move, basic
+def : SchedAlias<WriteIS, N2Write_1cyc_1I>;
+
+// Bitfield move, insert
+def : InstRW<[N2Write_2cyc_1M], (instregex "^BFM[WX]ri$")>;
+
+// Load instructions
+// -----------------------------------------------------------------------------
+
+def : SchedAlias<WriteLD, N2Write_4cyc_1L>;
+def : SchedAlias<WriteLDIdx, N2Write_4cyc_1I_1L>;
+
+// Load pair, signed immed offset, signed words
+def : InstRW<[N2Write_5cyc_1M0, WriteLDHi], (instrs LDPSWi)>;
+// Load pair, immed post-index or immed pre-index, signed words
+def : InstRW<[N2Write_5cyc_1M0, WriteLDHi, WriteAdr],
+ (instregex "^LDPSW(post|pre)$")>;
+
+// Store instructions
+// -----------------------------------------------------------------------------
+
+def : SchedAlias<WriteST, N2Write_1cyc_1L01_1D>;
+def : SchedAlias<WriteSTIdx, N2Write_1cyc_1L01_1D_1I>;
+def : SchedAlias<WriteSTP, N2Write_1cyc_1L01_1D>;
+def : SchedAlias<WriteAdr, N2Write_1cyc_1I>; // copied from A57.
+
+// Tag load instructions
+// -----------------------------------------------------------------------------
+
+// Load allocation tag
+// Load multiple allocation tags
+def : InstRW<[N2Write_4cyc_1L], (instrs LDG, LDGM)>;
+
+// Tag store instructions
+// -----------------------------------------------------------------------------
+
+// Store allocation tags to one or two granules, post-index
+// Store allocation tags to one or two granules, pre-index
+// Store allocation tag to one or two granules, zeroing, post-index
+// Store Allocation Tag to one or two granules, zeroing, pre-index
+// Store allocation tag and reg pair to memory, post-Index
+// Store allocation tag and reg pair to memory, pre-Index
+def : InstRW<[N2Write_1cyc_1L01_1D_1I], (instrs STGPreIndex, STGPostIndex,
+ ST2GPreIndex, ST2GPostIndex,
+ STZGPreIndex, STZGPostIndex,
+ STZ2GPreIndex, STZ2GPostIndex,
+ STGPpre, STGPpost)>;
+
+// Store allocation tags to one or two granules, signed offset
+// Store allocation tag to two granules, zeroing, signed offset
+// Store allocation tag and reg pair to memory, signed offset
+// Store multiple allocation tags
+def : InstRW<[N2Write_1cyc_1L01_1D], (instrs STGOffset, ST2GOffset, STZGOffset,
+ STZ2GOffset, STGPi, STGM, STZGM)>;
+
+// FP data processing instructions
+// -----------------------------------------------------------------------------
+
+// FP absolute value
+// FP arithmetic
+// FP min/max
+// FP negate
+// FP select
+def : SchedAlias<WriteF, N2Write_2cyc_1V>;
+
+// FP compare
+def : SchedAlias<WriteFCmp, N2Write_2cyc_1V0>;
+
+// FP divide, square root
+def : SchedAlias<WriteFDiv, N2Write_7cyc_1V0>;
+
+// FP divide, H-form
+def : InstRW<[N2Write_7cyc_1V0], (instrs FDIVHrr)>;
+// FP divide, S-form
+def : InstRW<[N2Write_10cyc_1V0], (instrs FDIVSrr)>;
+// FP divide, D-form
+def : InstRW<[N2Write_15cyc_1V0], (instrs FDIVDrr)>;
+
+// FP square root, H-form
+def : InstRW<[N2Write_7cyc_1V0], (instrs FSQRTHr)>;
+// FP square root, S-form
+def : InstRW<[N2Write_9cyc_1V0], (instrs FSQRTSr)>;
+// FP square root, D-form
+def : InstRW<[N2Write_16cyc_1V0], (instrs FSQRTDr)>;
+
+// FP multiply
+def : WriteRes<WriteFMul, [N2UnitV]> { let Latency = 3; }
+
+// FP multiply accumulate
+def : InstRW<[N2Write_4cyc_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+
+// FP round to integral
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^FRINT[AIMNPXZ][HSD]r$",
+ "^FRINT(32|64)[XZ][SD]r$")>;
+
+// FP miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// FP convert, from gen to vec reg
+def : InstRW<[N2Write_3cyc_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>;
+
+// FP convert, from vec to gen reg
+def : InstRW<[N2Write_3cyc_1V], (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]r$")>;
+
+// FP convert, Javascript from vec to gen reg
+// FP convert, from vec to vec reg
+def : SchedAlias<WriteFCvt, N2Write_3cyc_1V0>;
+
+// FP move, immed
+// FP move, register
+def : SchedAlias<WriteFImm, N2Write_2cyc_1V>;
+
+// FP transfer, from gen to low half of vec reg
+def : InstRW<[N2Write_3cyc_1M0], (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr,
+ FMOVHWr, FMOVHXr, FMOVSWr, FMOVDXr)>;
+
+// FP transfer, from gen to high half of vec reg
+def : InstRW<[N2Write_5cyc_1M0_1V], (instrs FMOVXDHighr)>;
+
+// FP transfer, from vec to gen reg
+def : SchedAlias<WriteFCopy, N2Write_2cyc_1V>;
+
+// FP load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector reg, literal, S/D/Q forms
+// Load vector reg, unscaled immed
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[SDQ]l$",
+ "^LDUR[BHSDQ]i$")>;
+
+// Load vector reg, immed post-index
+def : InstRW<[N2Write_6cyc_1I_1L, WriteI], (instregex "^LDR[BHSDQ]post$")>;
+// Load vector reg, immed pre-index
+def : InstRW<[N2Write_6cyc_1I_1L, WriteAdr], (instregex "^LDR[BHSDQ]pre$")>;
+
+// Load vector reg, unsigned immed
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[BHSDQ]ui$")>;
+
+// Load vector reg, register offset, basic
+// Load vector reg, register offset, scale, S/D-form
+// Load vector reg, register offset, extend
+// Load vector reg, register offset, extend, scale, S/D-form
+def : InstRW<[N2Write_6cyc_1L, ReadAdrBase], (instregex "^LDR[BSD]ro[WX]$")>;
+
+// Load vector reg, register offset, scale, H/Q-form
+// Load vector reg, register offset, extend, scale, H/Q-form
+def : InstRW<[N2Write_7cyc_1I_1L, ReadAdrBase], (instregex "^LDR[HQ]ro[WX]$")>;
+
+// Load vector pair, immed offset, S/D-form
+def : InstRW<[N2Write_6cyc_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>;
+
+// Load vector pair, immed offset, Q-form
+def : InstRW<[N2Write_6cyc_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
+
+// Load vector pair, immed post-index, S/D-form
+// Load vector pair, immed pre-index, S/D-form
+def : InstRW<[N2Write_6cyc_1I_1L, WriteLDHi, WriteAdr],
+ (instregex "^LDP[SD](pre|post)$")>;
+
+// Load vector pair, immed post-index, Q-form
+// Load vector pair, immed pre-index, Q-form
+def : InstRW<[N2Write_6cyc_2I_2L, WriteLDHi, WriteAdr], (instrs LDPQpost,
+ LDPQpre)>;
+
+// FP store instructions
+// -----------------------------------------------------------------------------
+
+// Store vector reg, unscaled immed, B/H/S/D-form
+// Store vector reg, unscaled immed, Q-form
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STUR[BHSDQ]i$")>;
+
+// Store vector reg, immed post-index, B/H/S/D-form
+// Store vector reg, immed post-index, Q-form
+// Store vector reg, immed pre-index, B/H/S/D-form
+// Store vector reg, immed pre-index, Q-form
+def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V_1I, ReadAdrBase],
+ (instregex "^STR[BHSDQ](pre|post)$")>;
+
+// Store vector reg, unsigned immed, B/H/S/D-form
+// Store vector reg, unsigned immed, Q-form
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STR[BHSDQ]ui$")>;
+
+// Store vector reg, register offset, basic, B/H/S/D-form
+// Store vector reg, register offset, basic, Q-form
+// Store vector reg, register offset, scale, S/D-form
+// Store vector reg, register offset, extend, B/H/S/D-form
+// Store vector reg, register offset, extend, Q-form
+// Store vector reg, register offset, extend, scale, S/D-form
+def : InstRW<[N2Write_2cyc_1L01_1V, ReadAdrBase],
+ (instregex "^STR[BSD]ro[WX]$")>;
+
+// Store vector reg, register offset, scale, H-form
+// Store vector reg, register offset, scale, Q-form
+// Store vector reg, register offset, extend, scale, H-form
+// Store vector reg, register offset, extend, scale, Q-form
+def : InstRW<[N2Write_2cyc_1L01_1V, ReadAdrBase],
+ (instregex "^STR[HQ]ro[WX]$")>;
+
+// Store vector pair, immed offset, S-form
+// Store vector pair, immed offset, D-form
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STN?P[SD]i$")>;
+
+// Store vector pair, immed offset, Q-form
+def : InstRW<[N2Write_2cyc_1L01_2V], (instrs STPQi, STNPQi)>;
+
+// Store vector pair, immed post-index, S-form
+// Store vector pair, immed post-index, D-form
+// Store vector pair, immed pre-index, S-form
+// Store vector pair, immed pre-index, D-form
+def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V_1I],
+ (instregex "^STP[SD](pre|post)$")>;
+
+// Store vector pair, immed post-index, Q-form
+def : InstRW<[N2Write_2cyc_1L01_2V_1I], (instrs STPQpost)>;
+
+// Store vector pair, immed pre-index, Q-form
+def : InstRW<[N2Write_2cyc_1L01_2V_2I], (instrs STPQpre)>;
+
+// ASIMD integer instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD absolute diff
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD arith, pair-wise
+// ASIMD compare
+// ASIMD logical
+// ASIMD max/min, basic and pair-wise
+def : SchedAlias<WriteVd, N2Write_2cyc_1V>;
+def : SchedAlias<WriteVq, N2Write_2cyc_1V>;
+
+// ASIMD absolute diff accum
+// ASIMD absolute diff accum long
+def : InstRW<[N2Write_4cyc_1V1],
+ (instregex "^SABAv", "^UABAv", "^SABALv", "^UABALv")>;
+
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[N2Write_2cyc_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[N2Write_4cyc_1V1_1V],
+ (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
+
+// ASIMD arith, reduce, 16B
+def : InstRW<[N2Write_4cyc_1V1], (instrs ADDVv16i8v, SADDLVv16i8v,
+ UADDLVv16i8v)>;
+
+// ASIMD dot product
+// ASIMD dot product using signed and unsigned integers
+def : InstRW<[N2Write_3cyc_1V],
+ (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
+
+// ASIMD matrix multiply-accumulate
+def : InstRW<[N2Write_3cyc_1V], (instrs SMMLA, UMMLA, USMMLA)>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[N2Write_2cyc_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$",
+ "^[SU](MAX|MIN)Vv4i32v$")>;
+
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[N2Write_4cyc_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
+ "^[SU](MAX|MIN)Vv8i16v$")>;
+
+// ASIMD max/min, reduce, 16B
+def : InstRW<[N2Write_4cyc_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
+
+// ASIMD multiply
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;
+
+// ASIMD multiply accumulate
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^MLAv", "^MLSv")>;
+
+// ASIMD multiply accumulate high
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+
+// ASIMD multiply accumulate saturating long
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDMLALv", "^SQDMLSLv")>;
+
+// ASIMD multiply/multiply long (8x8) polynomial, D-form
+// ASIMD multiply/multiply long (8x8) polynomial, Q-form
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^PMULL?(v8i8|v16i8)$")>;
+
+// ASIMD multiply long
+def : InstRW<[N2Write_3cyc_1V], (instregex "^[SU]MULLv", "^SQDMULLv")>;
+
+// ASIMD pairwise add and accumulate long
+def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ADALPv")>;
+
+// ASIMD shift accumulate
+def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]SRAv", "^[SU]RSRAv")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[N2Write_2cyc_1V1], (instregex "^SHLv", "^SHLLv", "^SHRNv",
+ "^SSHLLv", "^SSHRv", "^USHLLv",
+ "^USHRv")>;
+
+// ASIMD shift by immed and insert, basic
+def : InstRW<[N2Write_2cyc_1V1], (instregex "^SLIv", "^SRIv")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[N2Write_4cyc_1V1],
+ (instregex "^RSHRNv", "^SQRSHRNv", "^SQRSHRUNv",
+ "^(SQSHLU?|UQSHL)[bhsd]$",
+ "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
+ "^SQSHRNv", "^SQSHRUNv", "^SRSHRv", "^UQRSHRNv",
+ "^UQSHRNv", "^URSHRv")>;
+
+// ASIMD shift by register, basic
+def : InstRW<[N2Write_2cyc_1V1], (instregex "^[SU]SHLv")>;
+
+// ASIMD shift by register, complex
+def : InstRW<[N2Write_4cyc_1V1],
+ (instregex "^[SU]RSHLv", "^[SU]QRSHLv",
+ "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>;
+
+// ASIMD floating-point instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD FP absolute value/difference
+// ASIMD FP arith, normal
+// ASIMD FP compare
+// ASIMD FP complex add
+// ASIMD FP max/min, normal
+// ASIMD FP max/min, pairwise
+// ASIMD FP negate
+// Handled by SchedAlias<WriteV[dq], ...>
+
+// ASIMD FP complex multiply add
+def : InstRW<[N2Write_4cyc_1V], (instregex "^FCMLAv")>;
+
+// ASIMD FP convert, long (F16 to F32)
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVTL(v4|v8)i16")>;
+
+// ASIMD FP convert, long (F32 to F64)
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^FCVTL(v2|v4)i32")>;
+
+// ASIMD FP convert, narrow (F32 to F16)
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVTN(v4|v8)i16")>;
+
+// ASIMD FP convert, narrow (F64 to F32)
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^FCVTN(v2|v4)i32",
+ "^FCVTXN(v2|v4)f32")>;
+
+// ASIMD FP convert, other, D-form F32 and Q-form F64
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^[FSU]CVT[AMNPZ][SU]v2f(32|64)$",
+ "^[SU]CVTFv2f(32|64)$")>;
+
+// ASIMD FP convert, other, D-form F16 and Q-form F32
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^[FSU]CVT[AMNPZ][SU]v4f(16|32)$",
+ "^[SU]CVTFv4f(16|32)$")>;
+
+// ASIMD FP convert, other, Q-form F16
+def : InstRW<[N2Write_6cyc_4V0], (instregex "^[FSU]CVT[AMNPZ][SU]v8f16$",
+ "^[SU]CVTFv8f16$")>;
+
+// ASIMD FP divide, D-form, F16
+def : InstRW<[N2Write_7cyc_1V0], (instrs FDIVv4f16)>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[N2Write_10cyc_2V0], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F16
+def : InstRW<[N2Write_13cyc_2V0], (instrs FDIVv8f16)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[N2Write_10cyc_2V0], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[N2Write_15cyc_2V0], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, reduce, F32 and D-form F16
+def : InstRW<[N2Write_4cyc_1V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
+
+// ASIMD FP max/min, reduce, Q-form F16
+def : InstRW<[N2Write_6cyc_2V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
+
+// ASIMD FP multiply
+def : InstRW<[N2Write_3cyc_1V], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate
+def : InstRW<[N2Write_4cyc_1V], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[N2Write_5cyc_1V], (instregex "^FMLALv", "^FMLSLv")>;
+
+// ASIMD FP round, D-form F32 and Q-form F64
+def : InstRW<[N2Write_3cyc_1V0],
+ (instregex "^FRINT[AIMNPXZ]v2f(32|64)$",
+ "^FRINT[32|64)[XZ]v2f(32|64)$")>;
+
+// ASIMD FP round, D-form F16 and Q-form F32
+def : InstRW<[N2Write_4cyc_2V0],
+ (instregex "^FRINT[AIMNPXZ]v4f(16|32)$",
+ "^FRINT(32|64)[XZ]v4f32$")>;
+
+// ASIMD FP round, Q-form F16
+def : InstRW<[N2Write_6cyc_4V0], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
+
+// ASIMD FP square root, D-form, F16
+def : InstRW<[N2Write_7cyc_1V0], (instrs FSQRTv4f16)>;
+
+// ASIMD FP square root, D-form, F32
+def : InstRW<[N2Write_10cyc_2V0], (instrs FSQRTv2f32)>;
+
+// ASIMD FP square root, Q-form, F16
+def : InstRW<[N2Write_13cyc_2V0], (instrs FSQRTv8f16)>;
+
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[N2Write_10cyc_2V0], (instrs FSQRTv4f32)>;
+
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[N2Write_16cyc_2V0], (instrs FSQRTv2f64)>;
+
+// ASIMD BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD convert, F32 to BF16
+def : InstRW<[N2Write_4cyc_1V0], (instrs BFCVTN, BFCVTN2)>;
+
+// ASIMD dot product
+def : InstRW<[N2Write_4cyc_1V], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
+
+// ASIMD matrix multiply accumulate
+def : InstRW<[N2Write_5cyc_1V], (instrs BFMMLA)>;
+
+// ASIMD multiply accumulate long
+def : InstRW<[N2Write_4cyc_1V], (instrs BFMLALB, BFMLALBIdx, BFMLALT,
+ BFMLALTIdx)>;
+
+// Scalar convert, F32 to BF16
+def : InstRW<[N2Write_3cyc_1V0], (instrs BFCVT)>;
+
+// ASIMD miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD bit reverse
+// ASIMD bitwise insert
+// ASIMD count
+// ASIMD duplicate, element
+// ASIMD extract
+// ASIMD extract narrow
+// ASIMD insert, element to element
+// ASIMD move, FP immed
+// ASIMD move, integer immed
+// ASIMD reverse
+// ASIMD table lookup, 1 or 2 table regs
+// ASIMD table lookup extension, 1 table reg
+// ASIMD transfer, element to gen reg
+// ASIMD transpose
+// ASIMD unzip/zip
+// Handled by SchedAlias<WriteV[dq], ...>
+
+// ASIMD duplicate, gen reg
+def : InstRW<[N2Write_3cyc_1M0], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]QXTNv", "^SQXTUNv")>;
+
+// ASIMD reciprocal and square root estimate, D-form U32
+def : InstRW<[N2Write_3cyc_1V0], (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form U32
+def : InstRW<[N2Write_4cyc_2V0], (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms
+def : InstRW<[N2Write_3cyc_1V0], (instrs FRECPEv1f16, FRECPEv1i32,
+ FRECPEv1i64, FRECPEv2f32,
+ FRSQRTEv1f16, FRSQRTEv1i32,
+ FRSQRTEv1i64, FRSQRTEv2f32)>;
+
+// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32
+def : InstRW<[N2Write_4cyc_2V0], (instrs FRECPEv4f16, FRECPEv4f32,
+ FRSQRTEv4f16, FRSQRTEv4f32)>;
+
+// ASIMD reciprocal and square root estimate, Q-form F16
+def : InstRW<[N2Write_6cyc_4V0], (instrs FRECPEv8f16, FRSQRTEv8f16)>;
+
+// ASIMD reciprocal exponent
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^FRECPXv")>;
+
+// ASIMD reciprocal step
+def : InstRW<[N2Write_4cyc_1V], (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD table lookup, 3 table regs
+def : InstRW<[N2Write_4cyc_2V], (instrs TBLv8i8Three, TBLv16i8Three)>;
+
+// ASIMD table lookup, 4 table regs
+def : InstRW<[N2Write_4cyc_4V], (instrs TBLv8i8Four, TBLv16i8Four)>;
+
+// ASIMD table lookup extension, 2 table reg
+def : InstRW<[N2Write_4cyc_2V], (instrs TBXv8i8Two, TBXv16i8Two)>;
+
+// ASIMD table lookup extension, 3 table reg
+def : InstRW<[N2Write_6cyc_4V], (instrs TBXv8i8Three, TBXv16i8Three)>;
+
+// ASIMD table lookup extension, 4 table reg
+def : InstRW<[N2Write_6cyc_8V], (instrs TBXv8i8Four, TBXv16i8Four)>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[N2Write_5cyc_1M0_1V], (instregex "^INSv")>;
+
+// ASIMD load instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_6cyc_1L, WriteAdr],
+ (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_6cyc_1L, WriteAdr],
+ (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_6cyc_2L, WriteAdr],
+ (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_6cyc_2L, WriteAdr],
+ (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_6cyc_3L, WriteAdr],
+ (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_6cyc_3L, WriteAdr],
+ (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_7cyc_4L, WriteAdr],
+ (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_7cyc_4L, WriteAdr],
+ (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 2 element, multiple, Q-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[N2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_8cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[N2Write_9cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_9cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[N2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_8cyc_4L_4V, WriteAdr], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[N2Write_2cyc_3L01_3V], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_2cyc_3L01_3V, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[N2Write_2cyc_4L01_4V], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_2cyc_4L01_4V, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_4cyc_2L01_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+def : InstRW<[N2Write_5cyc_2L01_2V], (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[N2Write_5cyc_2L01_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+def : InstRW<[N2Write_7cyc_6L01_6V], (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[N2Write_7cyc_6L01_6V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[N2Write_5cyc_4L01_4V], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[N2Write_5cyc_4L01_4V, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H/S
+def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST4i(8|16|32)$")>;
+def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4i(8|16|32)_POST$")>;
+
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[N2Write_4cyc_3L01_3V], (instregex "ST4i(64)$")>;
+def : InstRW<[N2Write_4cyc_3L01_3V, WriteAdr], (instregex "ST4i(64)_POST$")>;
+
+// Cryptography extensions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[N2Write_2cyc_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[N2Write_2cyc_1V0], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 hash acceleration op
+// Crypto SHA1 schedule acceleration ops
+def : InstRW<[N2Write_2cyc_1V0], (instregex "^SHA1(H|SU0|SU1)")>;
+
+// Crypto SHA1 hash acceleration ops
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>;
+
+// Crypto SHA256 schedule acceleration ops
+def : InstRW<[N2Write_2cyc_1V0], (instregex "^SHA256SU[01]")>;
+
+// Crypto SHA512 hash acceleration ops
+def : InstRW<[N2Write_2cyc_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>;
+
+// Crypto SHA3 ops
+def : InstRW<[N2Write_2cyc_1V0], (instrs BCAX, EOR3, RAX1, XAR)>;
+
+// Crypto SM3 ops
+def : InstRW<[N2Write_2cyc_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$",
+ "^SM3TT[12][AB]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[N2Write_4cyc_1V0], (instrs SM4E, SM4ENCKEY)>;
+
+// CRC
+// -----------------------------------------------------------------------------
+
+def : InstRW<[N2Write_2cyc_1M0], (instregex "^CRC32")>;
+
+// SVE Predicate instructions
+// -----------------------------------------------------------------------------
+
+// Loop control, based on predicate
+def : InstRW<[N2Write_2cyc_1M], (instrs BRKA_PPmP, BRKA_PPzP,
+ BRKB_PPmP, BRKB_PPzP)>;
+
+// Loop control, based on predicate and flag setting
+def : InstRW<[N2Write_3cyc_1M], (instrs BRKAS_PPzP, BRKBS_PPzP)>;
+
+// Loop control, propagating
+def : InstRW<[N2Write_2cyc_1M0], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>;
+
+// Loop control, propagating and flag setting
+def : InstRW<[N2Write_3cyc_1M0_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP,
+ BRKPBS_PPzPP)>;
+
+// Loop control, based on GPR
+def : InstRW<[N2Write_3cyc_1M],
+ (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]$")>;
+
+def : InstRW<[N2Write_3cyc_1M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]$")>;
+
+// Loop terminate
+def : InstRW<[N2Write_1cyc_1M], (instregex "^CTERM(EQ|NE)_(WW|XX)$")>;
+
+// Predicate counting scalar
+def : InstRW<[N2Write_2cyc_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>;
+def : InstRW<[N2Write_2cyc_1M],
+ (instregex "^(CNT|DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI$",
+ "^SQ(DEC|INC)[BHWD]_XPiWdI$",
+ "^(UQDEC|UQINC)[BHWD]_WPiI$")>;
+
+// Predicate counting scalar, active predicate
+def : InstRW<[N2Write_2cyc_1M],
+ (instregex "^CNTP_XPP_[BHSD]$",
+ "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]$",
+ "^(UQDEC|UQINC)P_WP_[BHSD]$",
+ "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]$")>;
+
+// Predicate counting vector, active predicate
+def : InstRW<[N2Write_7cyc_1M_1M0_1V],
+ (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]$")>;
+
+// Predicate logical
+def : InstRW<[N2Write_1cyc_1M0],
+ (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP$")>;
+
+// Predicate logical, flag setting
+def : InstRW<[N2Write_2cyc_1M0_1M],
+ (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP$")>;
+
+// Predicate reverse
+def : InstRW<[N2Write_2cyc_1M], (instregex "^REV_PP_[BHSD]$")>;
+
+// Predicate select
+def : InstRW<[N2Write_1cyc_1M0], (instrs SEL_PPPP)>;
+
+// Predicate set
+def : InstRW<[N2Write_2cyc_1M], (instregex "^PFALSE$", "^PTRUE_[BHSD]$")>;
+
+// Predicate set/initialize, set flags
+def : InstRW<[N2Write_3cyc_1M], (instregex "^PTRUES_[BHSD]$")>;
+
+// Predicate find first/next
+def : InstRW<[N2Write_3cyc_1M], (instregex "^PFIRST_B$", "^PNEXT_[BHSD]$")>;
+
+// Predicate test
+def : InstRW<[N2Write_1cyc_1M], (instrs PTEST_PP)>;
+
+// Predicate transpose
+def : InstRW<[N2Write_2cyc_1M], (instregex "^TRN[12]_PPP_[BHSDQ]$")>;
+
+// Predicate unpack and widen
+def : InstRW<[N2Write_2cyc_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>;
+
+// Predicate zip/unzip
+def : InstRW<[N2Write_2cyc_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]$")>;
+
+// SVE integer instructions
+// -----------------------------------------------------------------------------
+
+// Arithmetic, absolute diff
+def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]$")>;
+
+// Arithmetic, absolute diff accum
+def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
+
+// Arithmetic, absolute diff accum long
+def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
+
+// Arithmetic, absolute diff long
+def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]$")>;
+
+// Arithmetic, basic
+def : InstRW<[N2Write_2cyc_1V],
+ (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]$",
+ "^(ADD|SUB)_ZZZ_[BHSD]$",
+ "^(ADD|SUB|SUBR)_ZI_[BHSD]$",
+ "^ADR_[SU]XTW_ZZZ_D_[0123]$",
+ "^ADR_LSL_ZZZ_[SD]_[0123]$",
+ "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]$",
+ "^SADDLBT_ZZZ_[HSD]$",
+ "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]$",
+ "^SSUBL(BT|TB)_ZZZ_[HSD]$")>;
+
+// Arithmetic, complex
+def : InstRW<[N2Write_2cyc_1V],
+ (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]$",
+ "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]$",
+ "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]$",
+ "^[SU]Q(ADD|SUB)_ZI_[BHSD]$",
+ "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]$",
+ "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]$")>;
+
+// Arithmetic, large integer
+def : InstRW<[N2Write_2cyc_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]$")>;
+
+// Arithmetic, pairwise add
+def : InstRW<[N2Write_2cyc_1V], (instregex "^ADDP_ZPmZ_[BHSD]$")>;
+
+// Arithmetic, pairwise add and accum long
+def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
+
+// Arithmetic, shift
+def : InstRW<[N2Write_2cyc_1V1],
+ (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]$",
+ "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]$",
+ "^(ASR|LSL|LSR)_ZPmI_[BHSD]$",
+ "^(ASR|LSL|LSR)_ZPmZ_[BHSD]$",
+ "^(ASR|LSL|LSR)_ZZI_[BHSD]$",
+ "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]$")>;
+
+// Arithmetic, shift and accumulate
+def : InstRW<[N2Write_4cyc_1V1],
+ (instregex "^(SRSRA|SSRA|URSRA|USRA)_ZZI_[BHSD]$")>;
+
+// Arithmetic, shift by immediate
+// Arithmetic, shift by immediate and insert
+def : InstRW<[N2Write_2cyc_1V1],
+ (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]$")>;
+
+// Arithmetic, shift complex
+def : InstRW<[N2Write_4cyc_1V1],
+ (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]$",
+ "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]$",
+ "^(SQSHL|SQSHLU|UQSHL)_ZPmI_[BHSD]$",
+ "^SQSHRU?N[BT]_ZZI_[BHS]$",
+ "^UQR?SHRN[BT]_ZZI_[BHS]$")>;
+
+// Arithmetic, shift right for divide
+def : InstRW<[N2Write_4cyc_1V1], (instregex "^ASRD_ZPmI_[BHSD]$")>;
+
+// Arithmetic, shift rounding
+def : InstRW<[N2Write_4cyc_1V1],
+ (instregex "^(SRSHL|SRSHLR|URSHL|URSHLR)_ZPmZ_[BHSD]$",
+ "^[SU]RSHR_ZPmI_[BHSD]$")>;
+
+// Bit manipulation
+def : InstRW<[N2Write_6cyc_2V1],
+ (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]$")>;
+
+// Bitwise select
+def : InstRW<[N2Write_2cyc_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ$")>;
+
+// Count/reverse bits
+def : InstRW<[N2Write_2cyc_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]$")>;
+
+// Broadcast logical bitmask immediate to vector
+def : InstRW<[N2Write_2cyc_1V], (instrs DUPM_ZI)>;
+
+// Compare and set flags
+def : InstRW<[N2Write_4cyc_1V0_1M],
+ (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]$",
+ "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]$")>;
+
+// Complex add
+def : InstRW<[N2Write_2cyc_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]$")>;
+
+// Complex dot product 8-bit element
+def : InstRW<[N2Write_3cyc_1V], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
+
+// Complex dot product 16-bit element
+def : InstRW<[N2Write_4cyc_1V0], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
+
+// Complex multiply-add B, H, S element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^CMLA_ZZZ_[BHS]$",
+ "^CMLA_ZZZI_[HS]$")>;
+
+// Complex multiply-add D element size
+def : InstRW<[N2Write_5cyc_2V0], (instrs CMLA_ZZZ_D)>;
+
+// Conditional extract operations, scalar form
+def : InstRW<[N2Write_8cyc_1M0_1V1_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>;
+
+// Conditional extract operations, SIMD&FP scalar and vector forms
+def : InstRW<[N2Write_3cyc_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]$",
+ "^COMPACT_ZPZ_[SD]$",
+ "^SPLICE_ZPZZ?_[BHSD]$")>;
+
+// Convert to floating point, 64b to float or convert to double
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]$")>;
+
+// Convert to floating point, 64b to half
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^[SU]CVTF_ZPmZ_DtoH$")>;
+
+// Convert to floating point, 32b to single or half
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]$")>;
+
+// Convert to floating point, 32b to double
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^[SU]CVTF_ZPmZ_StoD$")>;
+
+// Convert to floating point, 16b to half
+def : InstRW<[N2Write_6cyc_4V0], (instregex "^[SU]CVTF_ZPmZ_HtoH$")>;
+
+// Copy, scalar
+def : InstRW<[N2Write_5cyc_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]$")>;
+
+// Copy, scalar SIMD&FP or imm
+def : InstRW<[N2Write_2cyc_1V], (instregex "^CPY_ZPm[IV]_[BHSD]$",
+ "^CPY_ZPzI_[BHSD]$")>;
+
+// Divides, 32 bit
+def : InstRW<[N2Write_12cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_S$")>;
+
+// Divides, 64 bit
+def : InstRW<[N2Write_20cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_D$")>;
+
+// Dot product, 8 bit
+def : InstRW<[N2Write_3cyc_1V], (instregex "^[SU]DOT_ZZZI?_S$")>;
+
+// Dot product, 8 bit, using signed and unsigned integers
+def : InstRW<[N2Write_3cyc_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
+
+// Dot product, 16 bit
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>;
+
+// Duplicate, immediate and indexed form
+def : InstRW<[N2Write_2cyc_1V], (instregex "^DUP_ZI_[BHSD]$",
+ "^DUP_ZZI_[BHSDQ]$")>;
+
+// Duplicate, scalar form
+def : InstRW<[N2Write_3cyc_1M0], (instregex "^DUP_ZR_[BHSD]$")>;
+
+// Extend, sign or zero
+def : InstRW<[N2Write_2cyc_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]$",
+ "^[SU]XTH_ZPmZ_[SD]$",
+ "^[SU]XTW_ZPmZ_[D]$")>;
+
+// Extract
+def : InstRW<[N2Write_2cyc_1V], (instrs EXT_ZZI, EXT_ZZI_B)>;
+
+// Extract narrow saturating
+def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]$",
+ "^SQXTUN[BT]_ZZ_[BHS]$")>;
+
+// Extract/insert operation, SIMD and FP scalar form
+def : InstRW<[N2Write_3cyc_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]$",
+ "^INSR_ZV_[BHSD]$")>;
+
+// Extract/insert operation, scalar
+def : InstRW<[N2Write_5cyc_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]$",
+ "^INSR_ZR_[BHSD]$")>;
+
+// Histogram operations
+def : InstRW<[N2Write_2cyc_1V], (instregex "^HISTCNT_ZPzZZ_[SD]$",
+ "^HISTSEG_ZZZ$")>;
+
+// Horizontal operations, B, H, S form, immediate operands only
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^INDEX_II_[BHS]$")>;
+
+// Horizontal operations, B, H, S form, scalar, immediate operands / scalar
+// operands only / immediate, scalar operands
+def : InstRW<[N2Write_7cyc_1M0_1V0], (instregex "^INDEX_(IR|RI|RR)_[BHS]$")>;
+
+// Horizontal operations, D form, immediate operands only
+def : InstRW<[N2Write_5cyc_2V0], (instrs INDEX_II_D)>;
+
+// Horizontal operations, D form, scalar, immediate operands / scalar operands
+// only / immediate, scalar operands
+def : InstRW<[N2Write_8cyc_2M0_2V0], (instregex "^INDEX_(IR|RI|RR)_D$")>;
+
+// Logical
+def : InstRW<[N2Write_2cyc_1V],
+ (instregex "^(AND|EOR|ORR)_ZI$",
+ "^(AND|BIC|EOR|EOR(BT|TB)?|ORR)_ZZZ$",
+ "^EOR(BT|TB)_ZZZ_[BHSD]$",
+ "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]$")>;
+
+// Max/min, basic and pairwise
+def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]$",
+ "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]$")>;
+
+// Matching operations
+def : InstRW<[N2Write_2cyc_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]$")>;
+
+// Matrix multiply-accumulate
+def : InstRW<[N2Write_3cyc_1V], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+
+// Move prefix
+def : InstRW<[N2Write_2cyc_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
+ "^MOVPRFX_ZZ$")>;
+
+// Multiply, B, H, S element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]$",
+ "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]$")>;
+
+// Multiply, D element size
+def : InstRW<[N2Write_5cyc_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D$",
+ "^[SU]MULH_(ZPmZ|ZZZ)_D$")>;
+
+// Multiply long
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]$",
+ "^[SU]MULL[BT]_ZZZ_[HSD]$")>;
+
+// Multiply accumulate, B, H, S element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^ML[AS]_ZZZI_[BHS]$",
+ "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]$")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[N2Write_5cyc_2V0], (instregex "^ML[AS]_ZZZI_D$",
+ "^(ML[AS]|MAD|MSB)_ZPmZZ_D$")>;
+
+// Multiply accumulate long
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
+ "^[SU]ML[AS]L[BT]_ZZZI_[SD]$")>;
+
+// Multiply accumulate saturating doubling long regular
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
+ "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
+
+// Multiply saturating doubling high, B, H, S element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDMULH_ZZZ_[BHS]$",
+ "^SQDMULH_ZZZI_[HS]$")>;
+
+// Multiply saturating doubling high, D element size
+def : InstRW<[N2Write_5cyc_2V0], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
+
+// Multiply saturating doubling long
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]$",
+ "^SQDMULL[BT]_ZZZI_[SD]$")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, B, H, S
+// element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
+ "^SQRDCMLAH_ZZZ_[BHS]$",
+ "^SQRDML[AS]H_ZZZI_[HS]$",
+ "^SQRDCMLAH_ZZZI_[HS]$")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, D element
+// size
+def : InstRW<[N2Write_5cyc_2V0], (instregex "^SQRDML[AS]H_ZZZI?_D$",
+ "^SQRDCMLAH_ZZZ_D$")>;
+
+// Multiply saturating rounding doubling regular/complex, B, H, S element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQRDMULH_ZZZ_[BHS]$",
+ "^SQRDMULH_ZZZI_[HS]$")>;
+
+// Multiply saturating rounding doubling regular/complex, D element size
+def : InstRW<[N2Write_5cyc_2V0], (instregex "^SQRDMULH_ZZZI?_D$")>;
+
+// Multiply/multiply long, (8x8) polynomial
+def : InstRW<[N2Write_2cyc_1V0], (instregex "^PMUL_ZZZ_B$",
+ "^PMULL[BT]_ZZZ_[HDQ]$")>;
+
+// Predicate counting vector
+def : InstRW<[N2Write_2cyc_1V0],
+ (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI$")>;
+
+// Reciprocal estimate
+def : InstRW<[N2Write_4cyc_2V0], (instrs URECPE_ZPmZ_S, URSQRTE_ZPmZ_S)>;
+
+// Reduction, arithmetic, B form
+def : InstRW<[N2Write_11cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>;
+
+// Reduction, arithmetic, H form
+def : InstRW<[N2Write_9cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>;
+
+// Reduction, arithmetic, S form
+def : InstRW<[N2Write_8cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>;
+
+// Reduction, arithmetic, D form
+def : InstRW<[N2Write_8cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>;
+
+// Reduction, logical
+def : InstRW<[N2Write_6cyc_1V_1V1], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]$")>;
+
+// Reverse, vector
+def : InstRW<[N2Write_2cyc_1V], (instregex "^REV_ZZ_[BHSD]$",
+ "^REVB_ZPmZ_[HSD]$",
+ "^REVH_ZPmZ_[SD]$",
+ "^REVW_ZPmZ_D$")>;
+
+// Select, vector form
+def : InstRW<[N2Write_2cyc_1V], (instregex "^SEL_ZPZZ_[BHSD]$")>;
+
+// Table lookup
+def : InstRW<[N2Write_2cyc_1V], (instregex "^TBL_ZZZZ?_[BHSD]$")>;
+
+// Table lookup extension
+def : InstRW<[N2Write_2cyc_1V], (instregex "^TBX_ZZZ_[BHSD]$")>;
+
+// Transpose, vector form
+def : InstRW<[N2Write_2cyc_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]$")>;
+
+// Unpack and extend
+def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]$")>;
+
+// Zip/unzip
+def : InstRW<[N2Write_2cyc_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]$")>;
+
+// SVE floating-point instructions
+// -----------------------------------------------------------------------------
+
+// Floating point absolute value/difference
+def : InstRW<[N2Write_2cyc_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]$")>;
+
+// Floating point arithmetic
+def : InstRW<[N2Write_2cyc_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]$",
+ "^FADDP_ZPmZZ_[HSD]$",
+ "^FNEG_ZPmZ_[HSD]$",
+ "^FSUBR_ZPm[IZ]_[HSD]$")>;
+
+// Floating point associative add, F16
+def : InstRW<[N2Write_10cyc_1V1], (instrs FADDA_VPZ_H)>;
+
+// Floating point associative add, F32
+def : InstRW<[N2Write_6cyc_1V1], (instrs FADDA_VPZ_S)>;
+
+// Floating point associative add, F64
+def : InstRW<[N2Write_4cyc_1V], (instrs FADDA_VPZ_D)>;
+
+// Floating point compare
+def : InstRW<[N2Write_2cyc_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]$",
+ "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]$",
+ "^FCM(LE|LT)_PPzZ0_[HSD]$",
+ "^FCMUO_PPzZZ_[HSD]$")>;
+
+// Floating point complex add
+def : InstRW<[N2Write_3cyc_1V], (instregex "^FCADD_ZPmZ_[HSD]$")>;
+
+// Floating point complex multiply add
+def : InstRW<[N2Write_5cyc_1V], (instregex "^FCMLA_ZPmZZ_[HSD]$",
+ "^FCMLA_ZZZI_[HS]$")>;
+
+// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)$",
+ "^FCVTLT_ZPmZ_HtoS$",
+ "^FCVTNT_ZPmZ_StoH$")>;
+
+// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32
+// or F64 to F16)
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)$",
+ "^FCVTLT_ZPmZ_StoD$",
+ "^FCVTNT_ZPmZ_DtoS$")>;
+
+// Floating point convert, round to odd
+def : InstRW<[N2Write_3cyc_1V0], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>;
+
+// Floating point base2 log, F16
+def : InstRW<[N2Write_6cyc_4V0], (instrs FLOGB_ZPmZ_H)>;
+
+// Floating point base2 log, F32
+def : InstRW<[N2Write_4cyc_2V0], (instrs FLOGB_ZPmZ_S)>;
+
+// Floating point base2 log, F64
+def : InstRW<[N2Write_3cyc_1V0], (instrs FLOGB_ZPmZ_D)>;
+
+// Floating point convert to integer, F16
+def : InstRW<[N2Write_6cyc_4V0], (instregex "^FCVTZ[SU]_ZPmZ_HtoH$")>;
+
+// Floating point convert to integer, F32
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)$")>;
+
+// Floating point convert to integer, F64
+def : InstRW<[N2Write_3cyc_1V0],
+ (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)$")>;
+
+// Floating point copy
+def : InstRW<[N2Write_2cyc_1V], (instregex "^FCPY_ZPmI_[HSD]$",
+ "^FDUP_ZI_[HSD]$")>;
+
+// Floating point divide, F16
+def : InstRW<[N2Write_13cyc_1V0], (instregex "^FDIVR?_ZPmZ_H$")>;
+
+// Floating point divide, F32
+def : InstRW<[N2Write_10cyc_1V0], (instregex "^FDIVR?_ZPmZ_S$")>;
+
+// Floating point divide, F64
+def : InstRW<[N2Write_15cyc_1V0], (instregex "^FDIVR?_ZPmZ_D$")>;
+
+// Floating point min/max pairwise
+def : InstRW<[N2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]$")>;
+
+// Floating point min/max
+def : InstRW<[N2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]$")>;
+
+// Floating point multiply
+def : InstRW<[N2Write_3cyc_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]$",
+ "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]$")>;
+
+// Floating point multiply accumulate
+def : InstRW<[N2Write_4cyc_1V],
+ (instregex "^FML[AS]_(ZPmZZ|ZZZI)_[HSD]$",
+ "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_ZPmZZ_[HSD]$")>;
+
+// Floating point multiply add/sub accumulate long
+def : InstRW<[N2Write_4cyc_1V], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
+
+// Floating point reciprocal estimate, F16
+def : InstRW<[N2Write_6cyc_4V0], (instrs FRECPE_ZZ_H, FRECPX_ZPmZ_H,
+ FRSQRTE_ZZ_H)>;
+
+// Floating point reciprocal estimate, F32
+def : InstRW<[N2Write_4cyc_2V0], (instrs FRECPE_ZZ_S, FRECPX_ZPmZ_S,
+ FRSQRTE_ZZ_S)>;
+
+// Floating point reciprocal estimate, F64
+def : InstRW<[N2Write_3cyc_1V0], (instrs FRECPE_ZZ_D, FRECPX_ZPmZ_D,
+ FRSQRTE_ZZ_D)>;
+
+// Floating point reciprocal step
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>;
+
+// Floating point reduction, F16
+def : InstRW<[N2Write_6cyc_2V],
+ (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H$")>;
+
+// Floating point reduction, F32
+def : InstRW<[N2Write_4cyc_1V],
+ (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S$")>;
+
+// Floating point reduction, F64
+def : InstRW<[N2Write_2cyc_1V],
+ (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D$")>;
+
+// Floating point round to integral, F16
+def : InstRW<[N2Write_6cyc_4V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H$")>;
+
+// Floating point round to integral, F32
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S$")>;
+
+// Floating point round to integral, F64
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D$")>;
+
+// Floating point square root, F16
+def : InstRW<[N2Write_13cyc_1V0], (instrs FSQRT_ZPmZ_H)>;
+
+// Floating point square root, F32
+def : InstRW<[N2Write_10cyc_1V0], (instrs FSQRT_ZPmZ_S)>;
+
+// Floating point square root, F64
+def : InstRW<[N2Write_16cyc_1V0], (instrs FSQRT_ZPmZ_D)>;
+
+// Floating point trigonometric exponentiation
+def : InstRW<[N2Write_3cyc_1V1], (instregex "^FEXPA_ZZ_[HSD]$")>;
+
+// Floating point trigonometric multiply add
+def : InstRW<[N2Write_4cyc_1V], (instregex "^FTMAD_ZZI_[HSD]$")>;
+
+// Floating point trigonometric, miscellaneous
+def : InstRW<[N2Write_3cyc_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]$")>;
+
+// SVE BFloat16 (BF16) instructions
+// -----------------------------------------------------------------------------
+
+// Convert, F32 to BF16
+def : InstRW<[N2Write_3cyc_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
+
+// Dot product
+def : InstRW<[N2Write_4cyc_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+
+// Matrix multiply accumulate
+def : InstRW<[N2Write_5cyc_1V], (instrs BFMMLA_ZZZ)>;
+
+// Multiply accumulate long
+def : InstRW<[N2Write_4cyc_1V], (instregex "^BFMLAL[BT]_ZZ[ZI]$")>;
+
+// SVE Load instructions
+// -----------------------------------------------------------------------------
+
+// Load vector
+def : InstRW<[N2Write_6cyc_1L], (instrs LDR_ZXI)>;
+
+// Load predicate
+def : InstRW<[N2Write_6cyc_1L_1M], (instrs LDR_PXI)>;
+
+// Contiguous load, scalar + imm
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1[BHWD]_IMM_REAL$",
+ "^LD1S?B_[HSD]_IMM_REAL$",
+ "^LD1S?H_[SD]_IMM_REAL$",
+ "^LD1S?W_D_IMM_REAL$" )>;
+// Contiguous load, scalar + scalar
+def : InstRW<[N2Write_6cyc_1L01], (instregex "^LD1[BHWD]$",
+ "^LD1S?B_[HSD]$",
+ "^LD1S?H_[SD]$",
+ "^LD1S?W_D$" )>;
+
+// Contiguous load broadcast, scalar + imm
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1R[BHWD]_IMM$",
+ "^LD1RSW_IMM$",
+ "^LD1RS?B_[HSD]_IMM$",
+ "^LD1RS?H_[SD]_IMM$",
+ "^LD1RS?W_D_IMM$",
+ "^LD1RQ_[BHWD]_IMM$")>;
+
+// Contiguous load broadcast, scalar + scalar
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1RQ_[BHWD]$")>;
+
+// Non temporal load, scalar + imm
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LDNT1[BHWD]_ZRI$")>;
+
+// Non temporal load, scalar + scalar
+def : InstRW<[N2Write_6cyc_1L_1S], (instregex "^LDNT1[BHWD]_ZRR$")>;
+
+// Non temporal gather load, vector + scalar 32-bit element size
+def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^LDNT1[BHW]_ZZR_S_REAL$",
+ "^LDNT1S[BH]_ZZR_S_REAL$")>;
+
+// Non temporal gather load, vector + scalar 64-bit element size
+def : InstRW<[N2Write_10cyc_2L_2V1], (instregex "^LDNT1S?[BHW]_ZZR_D_REAL$")>;
+def : InstRW<[N2Write_10cyc_2L_2V1], (instrs LDNT1D_ZZR_D_REAL)>;
+
+// Contiguous first faulting load, scalar + scalar
+def : InstRW<[N2Write_6cyc_1L_1S], (instregex "^LDFF1[BHWD]_REAL$",
+ "^LDFF1S?B_[HSD]_REAL$",
+ "^LDFF1S?H_[SD]_REAL$",
+ "^LDFF1S?W_D_REAL$")>;
+
+// Contiguous non faulting load, scalar + imm
+def : InstRW<[N2Write_6cyc_1L], (instregex "^LDNF1[BHWD]_IMM_REAL$",
+ "^LDNF1S?B_[HSD]_IMM_REAL$",
+ "^LDNF1S?H_[SD]_IMM_REAL$",
+ "^LDNF1S?W_D_IMM_REAL$")>;
+
+// Contiguous Load two structures to two vectors, scalar + imm
+def : InstRW<[N2Write_8cyc_1L_1V], (instregex "^LD2[BHWD]_IMM$")>;
+
+// Contiguous Load two structures to two vectors, scalar + scalar
+def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^LD2[BHWD]$")>;
+
+// Contiguous Load three structures to three vectors, scalar + imm
+def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^LD3[BHWD]_IMM$")>;
+
+// Contiguous Load three structures to three vectors, scalar + scalar
+def : InstRW<[N2Write_10cyc_1V_1L_1S], (instregex "^LD3[BHWD]$")>;
+
+// Contiguous Load four structures to four vectors, scalar + imm
+def : InstRW<[N2Write_9cyc_2L_2V], (instregex "^LD4[BHWD]_IMM$")>;
+
+// Contiguous Load four structures to four vectors, scalar + scalar
+def : InstRW<[N2Write_10cyc_2L_2V_2S], (instregex "^LD4[BHWD]$")>;
+
+// Gather load, vector + imm, 32-bit element size
+def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^GLD(FF)?1S?[BH]_S_IMM_REAL$",
+ "^GLD(FF)?1W_IMM_REAL$")>;
+
+// Gather load, vector + imm, 64-bit element size
+def : InstRW<[N2Write_9cyc_2L_2V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM_REAL$",
+ "^GLD(FF)?1D_IMM_REAL$")>;
+
+// Gather load, 64-bit element size
+def : InstRW<[N2Write_9cyc_2L_2V],
+ (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW_(SCALED_)?REAL$",
+ "^GLD(FF)?1S?[BHW]_D_(SCALED_)?REAL$",
+ "^GLD(FF)?1D_[SU]XTW_(SCALED_)?REAL$",
+ "^GLD(FF)?1D_(SCALED_)?REAL$")>;
+
+// Gather load, 32-bit scaled offset
+def : InstRW<[N2Write_10cyc_2L_2V],
+ (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED_REAL$",
+ "^GLD(FF)?1W_[SU]XTW_SCALED_REAL")>;
+
+// Gather load, 32-bit unpacked unscaled offset
+def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW_REAL$",
+ "^GLD(FF)?1W_[SU]XTW_REAL$")>;
+
+// SVE Store instructions
+// -----------------------------------------------------------------------------
+
+// Store from predicate reg
+def : InstRW<[N2Write_1cyc_1L01], (instrs STR_PXI)>;
+
+// Store from vector reg
+def : InstRW<[N2Write_2cyc_1L01_1V], (instrs STR_ZXI)>;
+
+// Contiguous store, scalar + imm
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^ST1[BHWD]_IMM$",
+ "^ST1B_[HSD]_IMM$",
+ "^ST1H_[SD]_IMM$",
+ "^ST1W_D_IMM$")>;
+
+// Contiguous store, scalar + scalar
+def : InstRW<[N2Write_2cyc_1L01_1S_1V], (instregex "^ST1H(_[SD])?$")>;
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^ST1[BWD]$",
+ "^ST1B_[HSD]$",
+ "^ST1W_D$")>;
+
+// Contiguous store two structures from two vectors, scalar + imm
+def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "^ST2[BHWD]_IMM$")>;
+
+// Contiguous store two structures from two vectors, scalar + scalar
+def : InstRW<[N2Write_4cyc_1L01_1S_1V], (instrs ST2H)>;
+
+// Contiguous store two structures from two vectors, scalar + scalar
+def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "^ST2[BWD]$")>;
+
+// Contiguous store three structures from three vectors, scalar + imm
+def : InstRW<[N2Write_7cyc_5L01_5V], (instregex "^ST3[BHWD]_IMM$")>;
+
+// Contiguous store three structures from three vectors, scalar + scalar
+def : InstRW<[N2Write_7cyc_5L01_5S_5V], (instrs ST3H)>;
+
+// Contiguous store three structures from three vectors, scalar + scalar
+def : InstRW<[N2Write_7cyc_5L01_5S_5V], (instregex "^ST3[BWD]$")>;
+
+// Contiguous store four structures from four vectors, scalar + imm
+def : InstRW<[N2Write_11cyc_9L01_9V], (instregex "^ST4[BHWD]_IMM$")>;
+
+// Contiguous store four structures from four vectors, scalar + scalar
+def : InstRW<[N2Write_11cyc_9L01_9S_9V], (instrs ST4H)>;
+
+// Contiguous store four structures from four vectors, scalar + scalar
+def : InstRW<[N2Write_11cyc_9L01_9S_9V], (instregex "^ST4[BWD]$")>;
+
+// Non temporal store, scalar + imm
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>;
+
+// Non temporal store, scalar + scalar
+def : InstRW<[N2Write_2cyc_1L01_1S_1V], (instrs STNT1H_ZRR)>;
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>;
+
+// Scatter non temporal store, vector + scalar 32-bit element size
+def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "^STNT1[BHW]_ZZR_S")>;
+
+// Scatter non temporal store, vector + scalar 64-bit element size
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STNT1[BHWD]_ZZR_D")>;
+
+// Scatter store vector + imm 32-bit element size
+def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "^SST1[BH]_S_IMM$",
+ "^SST1W_IMM$")>;
+
+// Scatter store vector + imm 64-bit element size
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[BHW]_D_IMM$",
+ "^SST1D_IMM$")>;
+
+// Scatter store, 32-bit scaled offset
+def : InstRW<[N2Write_4cyc_2L01_2V],
+ (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unpacked unscaled offset
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[BHW]_D_[SU]XTW$",
+ "^SST1D_[SU]XTW$")>;
+
+// Scatter store, 32-bit unpacked scaled offset
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
+ "^SST1D_[SU]XTW_SCALED$")>;
+
+// Scatter store, 32-bit unscaled offset
+def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "^SST1[BH]_S_[SU]XTW$",
+ "^SST1W_[SU]XTW$")>;
+
+// Scatter store, 64-bit scaled offset
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[HW]_D_SCALED$",
+ "^SST1D_SCALED$")>;
+
+// Scatter store, 64-bit unscaled offset
+def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[BHW]_D$",
+ "^SST1D$")>;
+
+// SVE Miscellaneous instructions
+// -----------------------------------------------------------------------------
+
+// Read first fault register, unpredicated
+def : InstRW<[N2Write_2cyc_1M0], (instrs RDFFR_P_REAL)>;
+
+// Read first fault register, predicated
+def : InstRW<[N2Write_3cyc_1M0_1M], (instrs RDFFR_PPz_REAL)>;
+
+// Read first fault register and set flags
+def : InstRW<[N2Write_4cyc_2M0_2M], (instrs RDFFRS_PPz)>;
+
+// Set first fault register
+// Write to first fault register
+def : InstRW<[N2Write_2cyc_1M0], (instrs SETFFR, WRFFR)>;
+
+// Prefetch
+def : InstRW<[N2Write_4cyc_1L], (instregex "^PRF[BHWD]")>;
+
+// SVE Cryptographic instructions
+// -----------------------------------------------------------------------------
+
+// Crypto AES ops
+def : InstRW<[N2Write_2cyc_1V], (instregex "^AES[DE]_ZZZ_B$",
+ "^AESI?MC_ZZ_B$")>;
+
+// Crypto SHA3 ops
+def : InstRW<[N2Write_2cyc_1V0], (instregex "^(BCAX|EOR3)_ZZZZ$",
+ "^RAX1_ZZZ_D$",
+ "^XAR_ZZZI_[BHSD]$")>;
+
+// Crypto SM4 ops
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>;
+
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
index 6ecfc97a4273..9c1bf3231a55 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td
@@ -26,7 +26,8 @@ def TSV110Model : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
}
// Define each kind of processor resource and number available on the TSV110,
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
index ff34c0ce9a0c..8b380ae0e8f3 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -27,7 +27,8 @@ def ThunderXT8XModel : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index ffa0a5e7d91a..cdafa33da054 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -27,7 +27,8 @@ def ThunderX2T99Model : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
PAUnsupported.F,
- SMEUnsupported.F);
+ SMEUnsupported.F,
+ [HasMTE]);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
index 46a1c217f984..5b1e9b5bcf23 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
@@ -25,7 +25,8 @@ def ThunderX3T110Model : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
- PAUnsupported.F);
+ PAUnsupported.F,
+ [HasMTE]);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 41c7a8c5042f..274a025e82a0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -796,6 +796,50 @@ static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
return IC.replaceInstUsesWith(II, Extract);
}
+static Optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
+ IntrinsicInst &II) {
+ // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
+ // integer variant across a variety of micro-architectures, so replace the
+ // scalar integer CLAST[AB] intrinsic with the SIMD&FP variant. The resulting
+ // bitcast-to-fp + clast[ab] + bitcast-to-int sequence costs a cycle or two
+ // more, depending on the micro-architecture, but has been observed to be
+ // generally faster, particularly when the CLAST[AB] op is a loop-carried
+ // dependency.
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+ Value *Pg = II.getArgOperand(0);
+ Value *Fallback = II.getArgOperand(1);
+ Value *Vec = II.getArgOperand(2);
+ Type *Ty = II.getType();
+
+ if (!Ty->isIntegerTy())
+ return None;
+
+ Type *FPTy;
+ switch (cast<IntegerType>(Ty)->getBitWidth()) {
+ default:
+ return None;
+ case 16:
+ FPTy = Builder.getHalfTy();
+ break;
+ case 32:
+ FPTy = Builder.getFloatTy();
+ break;
+ case 64:
+ FPTy = Builder.getDoubleTy();
+ break;
+ }
+
+ Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
+ auto *FPVTy = VectorType::get(
+ FPTy, cast<VectorType>(Vec->getType())->getElementCount());
+ Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
+ auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
+ {Pg, FPFallBack, FPVec});
+ Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
+ return IC.replaceInstUsesWith(II, FPIItoInt);
+}
+
static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
IntrinsicInst &II) {
LLVMContext &Ctx = II.getContext();
@@ -1294,6 +1338,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_lasta:
case Intrinsic::aarch64_sve_lastb:
return instCombineSVELast(IC, II);
+ case Intrinsic::aarch64_sve_clasta_n:
+ case Intrinsic::aarch64_sve_clastb_n:
+ return instCombineSVECondLast(IC, II);
case Intrinsic::aarch64_sve_cntd:
return instCombineSVECntElts(IC, II, 2);
case Intrinsic::aarch64_sve_cntw:
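Note: the clasta_n/clastb_n combine above only retypes the operands; the operation is unchanged and the scalar result is bitcast back to integer, so users of the intrinsic see the same value. A rough IR-level sketch of the rewrite for the 32-bit case (hand-written illustration, not taken from this patch; the function and value names are made up):

    ; before
    declare i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)

    define i32 @clasta_i32(<vscale x 4 x i1> %pg, i32 %fallback, <vscale x 4 x i32> %v) {
      %r = call i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1> %pg, i32 %fallback, <vscale x 4 x i32> %v)
      ret i32 %r
    }

    ; after the combine: the same intrinsic, now on the equivalent SIMD&FP types
    declare float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1>, float, <vscale x 4 x float>)

    define i32 @clasta_i32(<vscale x 4 x i1> %pg, i32 %fallback, <vscale x 4 x i32> %v) {
      %fb.fp = bitcast i32 %fallback to float
      %v.fp  = bitcast <vscale x 4 x i32> %v to <vscale x 4 x float>
      %r.fp  = call float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1> %pg, float %fb.fp, <vscale x 4 x float> %v.fp)
      %r     = bitcast float %r.fp to i32
      ret i32 %r
    }

The intent is that instruction selection then picks the SIMD&FP CLASTA rather than the slower scalar-integer form.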
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index d0aacb457a39..59ec91843266 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -334,8 +334,10 @@ public:
return 2;
}
- bool emitGetActiveLaneMask() const {
- return ST->hasSVE();
+ PredicationStyle emitGetActiveLaneMask() const {
+ if (ST->hasSVE())
+ return PredicationStyle::DataAndControlFlow;
+ return PredicationStyle::None;
}
bool supportsScalableVectors() const { return ST->hasSVE(); }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 89e1d85a6085..aaef363e9b8d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelType.h"
@@ -354,7 +355,9 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
"Return value without a vreg");
bool Success = true;
- if (!VRegs.empty()) {
+ if (!FLI.CanLowerReturn) {
+ insertSRetStores(MIRBuilder, Val->getType(), VRegs, FLI.DemoteRegister);
+ } else if (!VRegs.empty()) {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
@@ -464,6 +467,18 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
return Success;
}
+bool AArch64CallLowering::canLowerReturn(MachineFunction &MF,
+ CallingConv::ID CallConv,
+ SmallVectorImpl<BaseArgInfo> &Outs,
+ bool IsVarArg) const {
+ SmallVector<CCValAssign, 16> ArgLocs;
+ const auto &TLI = *getTLI<AArch64TargetLowering>();
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
+ MF.getFunction().getContext());
+
+ return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv));
+}
+
/// Helper function to compute forwarded registers for musttail calls. Computes
/// the forwarded registers, sets MBB liveness, and emits COPY instructions that
/// can be used to save + restore registers later.
@@ -533,6 +548,12 @@ bool AArch64CallLowering::lowerFormalArguments(
SmallVector<ArgInfo, 8> SplitArgs;
SmallVector<std::pair<Register, Register>> BoolArgs;
+
+ // Insert the hidden sret parameter if the return value won't fit in the
+ // return registers.
+ if (!FLI.CanLowerReturn)
+ insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);
+
unsigned i = 0;
for (auto &Arg : F.args()) {
if (DL.getTypeStoreSize(Arg.getType()).isZero())
@@ -1194,7 +1215,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
- if (!Info.OrigRet.Ty->isVoidTy()) {
+ if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv);
CallReturnHandler Handler(MIRBuilder, MRI, MIB);
bool UsingReturnedArg =
@@ -1226,6 +1247,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
.addImm(Assigner.StackOffset)
.addImm(CalleePopBytes);
+ if (!Info.CanLowerReturn) {
+ insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
+ Info.DemoteRegister, Info.DemoteStackIndex);
+ }
return true;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
index aafb1d19640a..cbdf77f69a63 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
@@ -35,6 +35,10 @@ public:
ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI,
Register SwiftErrorVReg) const override;
+ bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv,
+ SmallVectorImpl<BaseArgInfo> &Outs,
+ bool IsVarArg) const override;
+
bool fallBackToDAGISel(const MachineFunction &MF) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 9a65687735fe..eb8d0552173d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -1710,11 +1710,6 @@ bool AArch64InstructionSelector::selectCompareBranch(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
Register CondReg = I.getOperand(0).getReg();
MachineInstr *CCMI = MRI.getVRegDef(CondReg);
- if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
- CondReg = CCMI->getOperand(1).getReg();
- CCMI = MRI.getVRegDef(CondReg);
- }
-
// Try to select the G_BRCOND using whatever is feeding the condition if
// possible.
unsigned CCMIOpc = CCMI->getOpcode();
@@ -3346,12 +3341,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_SELECT: {
auto &Sel = cast<GSelect>(I);
- if (MRI.getType(Sel.getCondReg()) != LLT::scalar(1)) {
- LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
- << ", expected: " << LLT::scalar(1) << '\n');
- return false;
- }
-
const Register CondReg = Sel.getCondReg();
const Register TReg = Sel.getTrueReg();
const Register FReg = Sel.getFalseReg();
@@ -4777,12 +4766,6 @@ static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
return false;
MachineInstr *ValDef = MRI.getVRegDef(Val);
unsigned Opcode = ValDef->getOpcode();
- if (Opcode == TargetOpcode::G_TRUNC) {
- // Look through a trunc.
- Val = ValDef->getOperand(1).getReg();
- ValDef = MRI.getVRegDef(Val);
- Opcode = ValDef->getOpcode();
- }
if (isa<GAnyCmp>(ValDef)) {
CanNegate = true;
MustBeFirst = false;
@@ -4870,12 +4853,6 @@ MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
auto &MRI = *MIB.getMRI();
MachineInstr *ValDef = MRI.getVRegDef(Val);
unsigned Opcode = ValDef->getOpcode();
- if (Opcode == TargetOpcode::G_TRUNC) {
- // Look through a trunc.
- Val = ValDef->getOperand(1).getReg();
- ValDef = MRI.getVRegDef(Val);
- Opcode = ValDef->getOpcode();
- }
if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
Register LHS = Cmp->getLHSReg();
Register RHS = Cmp->getRHSReg();
@@ -5026,31 +5003,17 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
// First, check if the condition is defined by a compare.
MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
- while (CondDef) {
- // We can only fold if all of the defs have one use.
- Register CondDefReg = CondDef->getOperand(0).getReg();
- if (!MRI.hasOneNonDBGUse(CondDefReg)) {
- // Unless it's another select.
- for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
- if (CondDef == &UI)
- continue;
- if (UI.getOpcode() != TargetOpcode::G_SELECT)
- return false;
- }
- }
-
- // We can skip over G_TRUNC since the condition is 1-bit.
- // Truncating/extending can have no impact on the value.
- unsigned Opc = CondDef->getOpcode();
- if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC)
- break;
-
- // Can't see past copies from physregs.
- if (Opc == TargetOpcode::COPY &&
- Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
- return false;
- CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
+ // We can only fold if all of the defs have one use.
+ Register CondDefReg = CondDef->getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(CondDefReg)) {
+ // Unless it's another select.
+ for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
+ if (CondDef == &UI)
+ continue;
+ if (UI.getOpcode() != TargetOpcode::G_SELECT)
+ return false;
+ }
}
// Is the condition defined by a compare?
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 74ec9373ce9e..d3617b87a851 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -42,7 +42,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
: ST(&ST) {
using namespace TargetOpcode;
const LLT p0 = LLT::pointer(0, 64);
- const LLT s1 = LLT::scalar(1);
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
@@ -80,7 +79,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
const LLT &MinFPScalar = HasFP16 ? s16 : s32;
getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
- .legalFor({p0, s1, s8, s16, s32, s64})
+ .legalFor({p0, s8, s16, s32, s64})
.legalFor(PackedVectorAllTypeList)
.widenScalarToNextPow2(0)
.clampScalar(0, s8, s64)
@@ -198,8 +197,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(
{G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
- .legalFor({{s32, s1}, {s64, s1}})
+ .legalFor({{s32, s32}, {s64, s32}})
.clampScalar(0, s32, s64)
+ .clampScalar(1, s32, s64)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
@@ -241,7 +241,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_INSERT)
.legalIf(all(typeInSet(0, {s32, s64, p0}),
- typeInSet(1, {s1, s8, s16, s32}), smallerThan(1, 0)))
+ typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
.widenScalarToNextPow2(0)
.clampScalar(0, s32, s64)
.widenScalarToNextPow2(1)
@@ -260,8 +260,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
.maxScalarIf(typeInSet(1, {s128}), 0, s64);
- getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
- .lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
+
+ for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
+ auto &Actions = getActionDefinitionsBuilder(Op);
+
+ if (Op == G_SEXTLOAD)
+ Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
+
+ // Atomics have zero extending behavior.
+ Actions
.legalForTypesWithMemDesc({{s32, p0, s8, 8},
{s32, p0, s16, 8},
{s32, p0, s32, 8},
@@ -278,6 +285,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.unsupportedIfMemSizeNotPow2()
// Lower anything left over into G_*EXT and G_LOAD
.lower();
+ }
auto IsPtrVecPred = [=](const LegalityQuery &Query) {
const LLT &ValTy = Query.Types[0];
@@ -425,10 +433,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
const LLT &SrcTy = Query.Types[1];
- // Special case for s1.
- if (SrcTy == s1)
- return true;
-
// Make sure we fit in a register otherwise. Don't bother checking that
// the source type is below 128 bits. We shouldn't be allowing anything
// through which is wider than the destination in the first place.
@@ -481,13 +485,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(0);
// Control-flow
- getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32});
+ getActionDefinitionsBuilder(G_BRCOND)
+ .legalFor({s32})
+ .clampScalar(0, s32, s32);
getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
getActionDefinitionsBuilder(G_SELECT)
- .legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
+ .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
.widenScalarToNextPow2(0)
.clampScalar(0, s32, s64)
+ .clampScalar(1, s32, s32)
.minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
.lowerIf(isVector(0));
@@ -500,7 +507,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
getActionDefinitionsBuilder(G_PTRTOINT)
- .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
+ .legalForCartesianProduct({s8, s16, s32, s64}, {p0})
.legalFor({{v2s64, v2p0}})
.maxScalar(0, s64)
.widenScalarToNextPow2(0, /*Min*/ 8);
@@ -517,7 +524,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
// FIXME: This is wrong since G_BITCAST is not allowed to change the
// number of bits but it's what the previous code described and fixing
// it breaks tests.
- .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
+ .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
v2p0});
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 2901e5c0fe4d..bd0a497fa441 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -43,11 +43,9 @@ namespace {
class AArch64MCCodeEmitter : public MCCodeEmitter {
MCContext &Ctx;
- const MCInstrInfo &MCII;
public:
- AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
- : Ctx(ctx), MCII(mcii) {}
+ AArch64MCCodeEmitter(const MCInstrInfo &, MCContext &ctx) : Ctx(ctx) {}
AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) = delete;
void operator=(const AArch64MCCodeEmitter &) = delete;
~AArch64MCCodeEmitter() override = default;
@@ -193,12 +191,6 @@ public:
uint32_t encodeMatrixIndexGPR32(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
-
-private:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
@@ -618,9 +610,6 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
if (MI.getOpcode() == AArch64::TLSDESCCALL) {
// This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
// following (BLR) instruction. It doesn't emit any code itself so it
@@ -674,7 +663,6 @@ unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison(
return EncodedValue;
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "AArch64GenMCCodeEmitter.inc"
MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 34e3b2cf58e4..f129bfe11e4d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -34,6 +34,7 @@ using namespace llvm;
#define GET_INSTRINFO_MC_DESC
#define GET_INSTRINFO_MC_HELPERS
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "AArch64GenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 049c49796dc6..7d1de3e53c0c 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -33,6 +33,7 @@ class MCSubtargetInfo;
class MCTargetOptions;
class MCTargetStreamer;
class Target;
+class FeatureBitset;
MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
MCContext &Ctx);
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 2744e81f99f1..cb36aa26e839 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -227,6 +227,40 @@ class sme_add_vector_to_tile_u64<bit V, string mnemonic>
let Inst{2-0} = ZAda;
}
+class sme_add_vector_to_tile_pseudo<ZPRRegOp zpr_ty>
+ : Pseudo<(outs),
+ (ins i64imm:$tile, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let usesCustomInserter = 1;
+}
+
+def ADDHA_MPPZ_PSEUDO_S : sme_add_vector_to_tile_pseudo<ZPR32>;
+def ADDVA_MPPZ_PSEUDO_S : sme_add_vector_to_tile_pseudo<ZPR32>;
+
+def : Pat<(int_aarch64_sme_addha
+ imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm),
+ (nxv4i32 ZPR32:$zn)),
+ (ADDHA_MPPZ_PSEUDO_S imm0_3:$tile, $pn, $pm, $zn)>;
+def : Pat<(int_aarch64_sme_addva
+ imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm),
+ (nxv4i32 ZPR32:$zn)),
+ (ADDVA_MPPZ_PSEUDO_S imm0_3:$tile, $pn, $pm, $zn)>;
+
+let Predicates = [HasSMEI64] in {
+def ADDHA_MPPZ_PSEUDO_D : sme_add_vector_to_tile_pseudo<ZPR64>;
+def ADDVA_MPPZ_PSEUDO_D : sme_add_vector_to_tile_pseudo<ZPR64>;
+
+def : Pat<(int_aarch64_sme_addha
+ imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm),
+ (nxv2i64 ZPR64:$zn)),
+ (ADDHA_MPPZ_PSEUDO_D imm0_7:$tile, $pn, $pm, $zn)>;
+def : Pat<(int_aarch64_sme_addva
+ imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm),
+ (nxv2i64 ZPR64:$zn)),
+ (ADDVA_MPPZ_PSEUDO_D imm0_7:$tile, $pn, $pm, $zn)>;
+}
+
//===----------------------------------------------------------------------===//
// SME Contiguous Loads
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 3631536a32b9..7cdd4c4af95e 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -650,11 +650,11 @@ multiclass sve_int_pfalse<bits<6> opc, string asm> {
def : Pat<(nxv1i1 immAllZerosV), (!cast<Instruction>(NAME))>;
}
-class sve_int_ptest<bits<6> opc, string asm>
+class sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op>
: I<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
asm, "\t$Pg, $Pn",
"",
- []>, Sched<[]> {
+ [(op (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn))]>, Sched<[]> {
bits<4> Pg;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
@@ -1691,6 +1691,9 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
!cast<Instruction>(NAME), PTRUE_S>;
def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2i1, nxv2i1,
!cast<Instruction>(NAME), PTRUE_D>;
+ // Emulate .Q operation using a PTRUE_D when the other lanes don't matter.
+ def : SVE_2_Op_AllActive_Pat<nxv1i1, op_nopred, nxv1i1, nxv1i1,
+ !cast<Instruction>(NAME), PTRUE_D>;
}
// An instance of sve_int_pred_log_and but uses op_nopred's first operand as the
@@ -1706,6 +1709,9 @@ multiclass sve_int_pred_log_v2<bits<4> opc, string asm, SDPatternOperator op,
(!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
def : Pat<(nxv2i1 (op_nopred nxv2i1:$Op1, nxv2i1:$Op2)),
(!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
+ // Emulate .Q operation using a PTRUE_D when the other lanes don't matter.
+ def : Pat<(nxv1i1 (op_nopred nxv1i1:$Op1, nxv1i1:$Op2)),
+ (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 71303611265c..cf8891cff1b3 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -343,7 +343,8 @@ struct SysAlias {
: Name(N), Encoding(E), FeaturesRequired(F) {}
bool haveFeatures(FeatureBitset ActiveFeatures) const {
- return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ return ActiveFeatures[llvm::AArch64::FeatureAll] ||
+ (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
}
FeatureBitset getRequiredFeatures() const { return FeaturesRequired; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index c4680cbedadf..91dc611fb265 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -317,6 +317,9 @@ extern char &SIFormMemoryClausesID;
void initializeSIPostRABundlerPass(PassRegistry&);
extern char &SIPostRABundlerID;
+void initializeGCNCreateVOPDPass(PassRegistry &);
+extern char &GCNCreateVOPDID;
+
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 94d7844e8a32..a8108b1d637b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -626,13 +626,13 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
Constant *FoldedT = SelOpNo ?
ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
- if (isa<ConstantExpr>(FoldedT))
+ if (!FoldedT || isa<ConstantExpr>(FoldedT))
return false;
Constant *FoldedF = SelOpNo ?
ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
- if (isa<ConstantExpr>(FoldedF))
+ if (!FoldedF || isa<ConstantExpr>(FoldedF))
return false;
IRBuilder<> Builder(&BO);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b00df27f5fd3..589992c7a7ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1883,20 +1883,24 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return true;
}
+// Match an immediate (if Imm is true) or an SGPR (if Imm is false)
+// offset. If Imm32Only is true, match only 32-bit immediate offsets
+// available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
- SDValue &Offset, bool &Imm) const {
+ SDValue &Offset, bool Imm,
+ bool Imm32Only) const {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
if (!C) {
+ if (Imm)
+ return false;
if (ByteOffsetNode.getValueType().isScalarInteger() &&
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
Offset = ByteOffsetNode;
- Imm = false;
return true;
}
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
Offset = ByteOffsetNode.getOperand(0);
- Imm = false;
return true;
}
}
@@ -1908,9 +1912,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
int64_t ByteOffset = C->getSExtValue();
Optional<int64_t> EncodedOffset =
AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
- if (EncodedOffset) {
+ if (EncodedOffset && Imm && !Imm32Only) {
Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
- Imm = true;
return true;
}
@@ -1919,7 +1922,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return false;
EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
- if (EncodedOffset) {
+ if (EncodedOffset && Imm32Only) {
Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
return true;
}
@@ -1927,11 +1930,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
return false;
- SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
- Offset = SDValue(
- CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
+ if (!Imm) {
+ SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+ Offset = SDValue(
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
+ return true;
+ }
- return true;
+ return false;
}
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
@@ -1959,8 +1965,12 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
Ops), 0);
}
+// Match a base and an immediate (if Imm is true) or an SGPR
+// (if Imm is false) offset. If Imm32Only is true, match only 32-bit
+// immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
- SDValue &Offset, bool &Imm) const {
+ SDValue &Offset, bool Imm,
+ bool Imm32Only) const {
SDLoc SL(Addr);
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -1977,41 +1987,34 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
assert(N0 && N1 && isa<ConstantSDNode>(N1));
}
if (N0 && N1) {
- if (SelectSMRDOffset(N1, Offset, Imm)) {
+ if (SelectSMRDOffset(N1, Offset, Imm, Imm32Only)) {
SBase = Expand32BitAddress(N0);
return true;
}
}
+ return false;
}
+ if (!Imm)
+ return false;
SBase = Expand32BitAddress(Addr);
Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
- Imm = true;
return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm = false;
- return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
+ return SelectSMRD(Addr, SBase, Offset, /* Imm */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
-
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
-
- bool Imm = false;
- if (!SelectSMRD(Addr, SBase, Offset, Imm))
- return false;
-
- return !Imm && isa<ConstantSDNode>(Offset);
+ return SelectSMRD(Addr, SBase, Offset, /* Imm */ true, /* Imm32Only */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm = false;
- return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
- !isa<ConstantSDNode>(Offset);
+ return SelectSMRD(Addr, SBase, Offset, /* Imm */ false);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 862be9dc5568..7894b8eb5b67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -193,11 +193,11 @@ private:
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &SAddr, SDValue &Offset) const;
- bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
- bool &Imm) const;
+ bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool Imm,
+ bool Imm32Only) const;
SDValue Expand32BitAddress(SDValue Addr) const;
- bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
- bool &Imm) const;
+ bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool Imm,
+ bool Imm32Only = false) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ef7929012597..bf520a560404 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4803,6 +4803,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::Nand:
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
+ case AtomicRMWInst::FMax:
+ case AtomicRMWInst::FMin:
return AtomicExpansionKind::CmpXChg;
default:
return AtomicExpansionKind::None;
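Note: returning CmpXChg for the new FMax/FMin cases means AtomicExpandPass lowers the atomicrmw to a compare-exchange loop instead of a native instruction. A hand-written sketch of that generic lowering, assuming a 32-bit float and seq_cst ordering (names and exact structure are illustrative, not literal pass output):

    declare float @llvm.maxnum.f32(float, float)

    ; roughly what  %r = atomicrmw fmax ptr %p, float %v seq_cst  becomes:
    define float @rmw_fmax(ptr %p, float %v) {
    entry:
      %init = load float, ptr %p, align 4
      br label %loop
    loop:
      %loaded = phi float [ %init, %entry ], [ %new.loaded, %loop ]
      %max = call float @llvm.maxnum.f32(float %loaded, float %v)
      %max.i = bitcast float %max to i32
      %old.i = bitcast float %loaded to i32
      %pair = cmpxchg ptr %p, i32 %old.i, i32 %max.i seq_cst seq_cst
      %new.i = extractvalue { i32, i1 } %pair, 0
      %ok = extractvalue { i32, i1 } %pair, 1
      %new.loaded = bitcast i32 %new.i to float
      br i1 %ok, label %end, label %loop
    end:
      ; the loop exits once the exchange succeeds; %new.loaded is then the value
      ; that was in memory before the update, matching atomicrmw semantics
      ret float %new.loaded
    }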
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3f242fdb6d8e..70fae9d784a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1180,7 +1180,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
if (Arg) {
- const int64_t Value = Arg.getValue().Value.getSExtValue();
+ const int64_t Value = Arg.value().Value.getSExtValue();
if (Value == 0) {
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
@@ -3235,7 +3235,7 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
- return false;
+ return Register();
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
@@ -3851,27 +3851,36 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
getAddrModeInfo(*MI, *MRI, AddrInfo);
// FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
- // then we can select all ptr + 32-bit offsets not just immediate offsets.
- if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+ // then we can select all ptr + 32-bit offsets.
+ if (AddrInfo.empty())
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
+ Register PtrReg = GEPInfo.SgprParts[0];
+
// SGPR offset is unsigned.
- if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
- return None;
+ if (AddrInfo[0].SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) &&
+ GEPInfo.Imm != 0) {
+ // If we make it this far we have a load with a 32-bit immediate offset.
+ // It is OK to select this using an SGPR offset, because we have already
+ // failed trying to select this load into one of the _IMM variants since
+ // the _IMM Patterns are considered before the _SGPR patterns.
+ Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(GEPInfo.Imm);
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
+ }
- // If we make it this far we have a load with an 32-bit immediate offset.
- // It is OK to select this using a sgpr offset, because we have already
- // failed trying to select this load into one of the _IMM variants since
- // the _IMM Patterns are considered before the _SGPR patterns.
- Register PtrReg = GEPInfo.SgprParts[0];
- Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
- .addImm(GEPInfo.Imm);
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
- }};
+ if (AddrInfo[0].SgprParts.size() == 2 && GEPInfo.Imm == 0) {
+ if (Register OffsetReg =
+ matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) {
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
+ }
+ }
+
+ return None;
}
std::pair<Register, int>
@@ -4231,7 +4240,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
},
[=](MachineInstrBuilder &MIB) { // vaddr
if (FI)
- MIB.addFrameIndex(FI.getValue());
+ MIB.addFrameIndex(FI.value());
else
MIB.addReg(VAddr);
},
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 31012915457b..26e6b9a10688 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -542,63 +542,37 @@ def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val),
}
} // End foreach as
-// TODO: Add GISelPredicateCode for the ret and noret PatFrags once
-// GlobalISelEmitter allows pattern matches where src and dst def count
-// mismatch.
-
-multiclass ret_noret_op {
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return true; }] in {
- def "_ret" : PatFrag<(ops node:$ptr, node:$data),
- (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
- }
-
- let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return false; }] in {
- def "_noret" : PatFrag<(ops node:$ptr, node:$data),
- (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
- }
+multiclass noret_op {
+ let HasNoUse = true in
+ def "_noret" : PatFrag<(ops node:$ptr, node:$data),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
}
-defm int_amdgcn_flat_atomic_fadd : ret_noret_op;
-defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op;
-defm int_amdgcn_flat_atomic_fmin : ret_noret_op;
-defm int_amdgcn_flat_atomic_fmax : ret_noret_op;
-defm int_amdgcn_global_atomic_fadd : ret_noret_op;
-defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op;
-defm int_amdgcn_global_atomic_fmin : ret_noret_op;
-defm int_amdgcn_global_atomic_fmax : ret_noret_op;
-defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op;
-
-multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
- let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return false; }] in {
- defm "_noret" : binary_atomic_op<atomic_op, IsInt>;
- }
+defm int_amdgcn_flat_atomic_fadd : noret_op;
+defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op;
+defm int_amdgcn_flat_atomic_fmin : noret_op;
+defm int_amdgcn_flat_atomic_fmax : noret_op;
+defm int_amdgcn_global_atomic_fadd : noret_op;
+defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op;
+defm int_amdgcn_global_atomic_fmin : noret_op;
+defm int_amdgcn_global_atomic_fmax : noret_op;
+defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return true; }] in {
- defm "_ret" : binary_atomic_op<atomic_op, IsInt>;
- }
+multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
+ let HasNoUse = true in
+ defm "_noret" : binary_atomic_op<atomic_op, IsInt>;
}
-multiclass ret_noret_ternary_atomic_op<SDNode atomic_op> {
- let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return false; }] in {
- defm "_noret" : ternary_atomic_op<atomic_op>;
- }
-
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return true; }] in {
- defm "_ret" : ternary_atomic_op<atomic_op>;
- }
+multiclass noret_ternary_atomic_op<SDNode atomic_op> {
+ let HasNoUse = true in
+ defm "_noret" : ternary_atomic_op<atomic_op>;
}
multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> {
foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
defm "_"#as : binary_atomic_op<atomic_op, IsInt>;
- defm "_"#as : ret_noret_binary_atomic_op<atomic_op, IsInt>;
+ defm "_"#as : noret_binary_atomic_op<atomic_op, IsInt>;
}
}
}
@@ -640,13 +614,15 @@ def store_align16_local: PatFrag<(ops node:$val, node:$ptr),
let AddressSpaces = StoreAddress_local.AddrSpaces in {
defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_local : ret_noret_ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_local_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_local : noret_ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_local_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
}
let AddressSpaces = StoreAddress_region.AddrSpaces in {
-defm atomic_cmp_swap_region : ret_noret_ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_region_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_region : noret_ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_region_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
}
//===----------------------------------------------------------------------===//
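
For readers unfamiliar with the HasNoUse PatFrag flag used above: it replaces the hand-written PredicateCode/GISelPredicateCode pairs and lets TableGen emit the no-use check for both selectors. Roughly, and only as a simplified sketch rather than the generated code, the predicate amounts to the following checks on the two representations.

// Simplified sketch of what HasNoUse boils down to at selection time
// (assumption: the real checks are emitted by TableGen and may differ).
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

// SelectionDAG side: the atomic's value result must be dead.
static bool isNoRetSDNode(llvm::SDNode *N) {
  return llvm::SDValue(N, 0).use_empty();
}

// GlobalISel side: the instruction's def must have no non-debug uses.
static bool isNoRetGMI(const llvm::MachineRegisterInfo &MRI,
                       const llvm::MachineInstr &MI) {
  return MRI.use_nodbg_empty(MI.getOperand(0).getReg());
}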
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index ed6ddbf426fd..38e04dedd9fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -171,6 +171,10 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
}
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ // FIXME: Enable feature predicate checks once all the tests pass.
+ // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
+ // getSubtargetInfo().getFeatureBits());
+
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 1b513c456307..745734aac2b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -131,8 +131,8 @@ public:
bool IsAOneAddressSpace = isOneAddressSpace(A);
bool IsBOneAddressSpace = isOneAddressSpace(B);
- return AIO.getValue() >= BIO.getValue() &&
- (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace);
+ return AIO.value() >= BIO.value() &&
+ (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace);
}
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 77816a783630..6bd906439ee8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -40,9 +40,9 @@ using namespace llvm;
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget
-static cl::opt<bool> DisablePowerSched(
- "amdgpu-disable-power-sched",
- cl::desc("Disable scheduling to minimize mAI power bursts"),
+static cl::opt<bool> EnablePowerSched(
+ "amdgpu-enable-power-sched",
+ cl::desc("Enable scheduling to minimize mAI power bursts"),
cl::init(false));
static cl::opt<bool> EnableVGPRIndexMode(
@@ -916,7 +916,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
void apply(ScheduleDAGInstrs *DAGInstrs) override {
const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasMAIInsts() || DisablePowerSched)
+ if (!ST.hasMAIInsts())
return;
DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
@@ -966,7 +966,8 @@ void GCNSubtarget::getPostRAMutations(
std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
- return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
+ return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
+ : nullptr;
}
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
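
On the createFillMFMAShadowMutation change above: returning nullptr when the new amdgpu-enable-power-sched flag is off relies on the scheduler tolerating null mutations (to my understanding, ScheduleDAGMI::addMutation simply drops a null), so no call-site guards are needed. A minimal sketch of that opt-in idiom, with hypothetical names:

// Minimal sketch of the opt-in mutation idiom (hypothetical flag and mutation;
// assumes addMutation() silently ignores a null mutation).
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/Support/CommandLine.h"
#include <memory>

static llvm::cl::opt<bool> EnableMyMutation("enable-my-mutation",
                                            llvm::cl::init(false));

struct MyMutation : llvm::ScheduleDAGMutation {
  void apply(llvm::ScheduleDAGInstrs *DAG) override { /* no-op placeholder */ }
};

std::unique_ptr<llvm::ScheduleDAGMutation> maybeCreateMyMutation() {
  // Callers can pass the result straight to addMutation(); when the flag is
  // off the returned nullptr is simply dropped.
  return EnableMyMutation ? std::make_unique<MyMutation>() : nullptr;
}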
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 1c6b9d35695a..971e44723758 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,11 +22,13 @@
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
+#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
@@ -278,6 +280,12 @@ static cl::opt<bool>
cl::desc("Enable s_delay_alu insertion"),
cl::init(true), cl::Hidden);
+// Enable GFX11+ VOPD
+static cl::opt<bool>
+ EnableVOPD("amdgpu-enable-vopd",
+ cl::desc("Enable VOPD, dual issue of VALU in wave32"),
+ cl::init(true), cl::Hidden);
+
// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
@@ -383,6 +391,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
initializeSIPostRABundlerPass(*PR);
+ initializeGCNCreateVOPDPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -920,6 +929,8 @@ public:
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createSchedBarrierDAGMutation());
+ if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
+ DAG->addMutation(createVOPDPairingMutation());
return DAG;
}
@@ -1399,6 +1410,8 @@ void GCNPassConfig::addPreSched2() {
}
void GCNPassConfig::addPreEmitPass() {
+ if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
+ addPass(&GCNCreateVOPDID);
addPass(createSIMemoryLegalizerPass());
addPass(createSIInsertWaitcntsPass());
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index a087323e5de7..04dd3e938a15 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1412,10 +1412,12 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
foreach RtnMode = ["ret", "noret"] in {
- defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode
+ defvar Op = !cast<SDPatternOperator>(OpPrefix
+ # !if(!eq(RtnMode, "ret"), "", "_noret")
# !if(isIntr, "", "_" # vt.Size));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+ let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
def : GCNPat<
(vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in,
@@ -1428,6 +1430,7 @@ multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isInt
(!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in,
VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset)
>;
+ } // end let AddedComplexity
} // end foreach RtnMode
}
@@ -1439,10 +1442,12 @@ multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> {
multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> {
foreach RtnMode = ["ret", "noret"] in {
- defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global_" # RtnMode
+ defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global"
+ # !if(!eq(RtnMode, "ret"), "", "_noret")
# "_" # vt.Size);
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+ let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset,
offset:$offset);
@@ -1465,6 +1470,7 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst>
!if(!eq(vt, i32), sub0, sub0_sub1)),
Addr64ResDag)
>;
+ } // end let AddedComplexity
} // end foreach RtnMode
}
@@ -1495,13 +1501,14 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
list<string> RtnModes = ["ret", "noret"]> {
foreach RtnMode = RtnModes in {
- defvar Op = !cast<SDPatternOperator>(!if(!eq(RtnMode, "none"),
- OpPrefix, OpPrefix # "_" # RtnMode));
- defvar InstSuffix = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")),
- "_RTN", "");
- defvar CachePolicy = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")),
+ defvar Op = !cast<SDPatternOperator>(OpPrefix
+ # !if(!eq(RtnMode, "ret"), "", "_noret"));
+
+ defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+ defvar CachePolicy = !if(!eq(RtnMode, "ret"),
(set_glc $cachepolicy), (timm:$cachepolicy));
+ let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset,
timm:$offset, timm:$cachepolicy, 0)),
@@ -1534,6 +1541,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy)
>;
+ } // end let AddedComplexity
} // end foreach RtnMode
}
@@ -1551,7 +1559,7 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i32, "BUFFER_ATOMIC_OR">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i32, "BUFFER_ATOMIC_XOR">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i32, "BUFFER_ATOMIC_INC">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i32, "BUFFER_ATOMIC_DEC">;
-defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["none"]>;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["ret"]>;
defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i64, "BUFFER_ATOMIC_SWAP_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i64, "BUFFER_ATOMIC_ADD_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i64, "BUFFER_ATOMIC_SUB_X2">;
@@ -1643,7 +1651,8 @@ let SubtargetPredicate = isGFX90APlus in {
foreach RtnMode = ["ret", "noret"] in {
-defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap # "_" # RtnMode);
+defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap
+ # !if(!eq(RtnMode, "ret"), "", "_noret"));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
(timm:$cachepolicy));
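
Across these buffer-atomic pattern changes (and the DS and FLAT ones that follow) the no-return patterns get AddedComplexity = 1. The toy example below, which is only an illustration and not TableGen output, shows why a bump of one is enough: when several patterns can match the same node, higher-complexity patterns are tried first, so the no-return form wins exactly when its HasNoUse predicate holds, and the plain returning form remains the fallback.

// Toy model of pattern selection order (illustrative only): candidates are
// tried from highest AddedComplexity to lowest; the first match wins.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

struct Pattern {
  std::string Name;
  int AddedComplexity;
  bool (*Matches)(bool ResultIsUsed);
};

static std::string pick(std::vector<Pattern> Pats, bool ResultIsUsed) {
  std::stable_sort(Pats.begin(), Pats.end(),
                   [](const Pattern &A, const Pattern &B) {
                     return A.AddedComplexity > B.AddedComplexity;
                   });
  for (const Pattern &P : Pats)
    if (P.Matches(ResultIsUsed))
      return P.Name;
  return "<no match>";
}

int main() {
  std::vector<Pattern> Pats = {
      {"ATOMIC_RTN", 0, [](bool) { return true; }},
      {"ATOMIC_noret", 1, [](bool Used) { return !Used; }}, // HasNoUse check
  };
  std::printf("%s\n", pick(Pats, /*ResultIsUsed=*/true).c_str());  // ATOMIC_RTN
  std::printf("%s\n", pick(Pats, /*ResultIsUsed=*/false).c_str()); // ATOMIC_noret
  return 0;
}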
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 27b723875aa4..d8387bf6f1ae 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -950,10 +950,11 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
} // End AddedComplexity = 100
-class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
- (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
->;
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
+ bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
+ let AddedComplexity = complexity;
+}
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
@@ -965,75 +966,88 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
!cast<PatFrag>(frag#"_local_"#vt.Size)>;
}
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
}
multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), /* complexity */ 1>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
}
def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>;
+ !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>;
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ /* complexity */ 1, /* gds */ 1>;
}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
-class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
+class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag,
+ int complexity = 0, bit gds=0> : GCNPat<
(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))
->;
+ (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))> {
+ let AddedComplexity = complexity;
+}
multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt,
string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>;
- def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>;
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size),
+ /* complexity */ 1>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt.Size),
+ /* complexity */ 1>;
}
- def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>;
- def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>;
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ /* complexity */ 1, /* gds */ 1>;
}
} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10
let SubtargetPredicate = isGFX11Plus in {
// The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode.
-class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
+class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag,
+ int complexity = 0, bit gds=0> : GCNPat<
(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))
->;
+ (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))> {
+ let AddedComplexity = complexity;
+}
multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> {
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>;
- def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>;
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
+ def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ /* complexity */ 1, /* gds */ 1>;
}
} // End SubtargetPredicate = isGFX11Plus
@@ -1090,17 +1104,20 @@ defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp
} // End SubtargetPredicate = isGFX11Plus
let SubtargetPredicate = isGFX90APlus in {
-def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_ret_64>;
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>;
+let AddedComplexity = 1 in
def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
}
let SubtargetPredicate = isGFX940Plus in {
-def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_ret_32>;
+def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
+let AddedComplexity = 1 in
def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>;
def : GCNPat <
- (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)),
+ (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
(DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
>;
+let AddedComplexity = 1 in
def : GCNPat <
(v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
(DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index cb2822818549..c634e15945ad 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1015,31 +1015,35 @@ class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt
multiclass FlatAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt> {
- defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size);
+ defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size);
defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+ let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
- ValueType data_vt = vt, bit isIntr = 0> {
- defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+ ValueType data_vt = vt, int complexity = 0,
+ bit isIntr = 0> {
+ defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size));
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+ let AddedComplexity = complexity in
def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+ let AddedComplexity = !add(complexity, 1) in
def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt,
ValueType data_vt = vt> {
- defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+ defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* complexity */ 0, /* isIntr */ 1>;
}
class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1260,17 +1264,16 @@ multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator nod
multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> {
- defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+ defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size));
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
- let AddedComplexity = 10 in {
- defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>;
- }
+ defm : FlatSignedAtomicPat <inst, node, vt, data_vt, /* complexity */ 10, isIntr>;
- let AddedComplexity = 11 in {
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>;
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>;
- }
+ let AddedComplexity = 13 in
+ def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>;
+
+ let AddedComplexity = 12 in
+ def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>;
}
multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
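
Reading the GlobalFLATAtomicPats change above, the AddedComplexity values spread the four global-atomic forms into a strict preference order. Spelled out as a small check rather than real code, and based only on my reading of the numbers in this hunk:

// Preference order implied by the AddedComplexity values above (assumption:
// my reading of 10/11/12/13): the SADDR forms beat the VADDR forms, and
// within each addressing form the no-return pattern beats the returning one.
constexpr int RetVAddr = 10;    // FlatSignedAtomicPat, returning
constexpr int NoRetVAddr = 11;  // FlatSignedAtomicPat, no-return (+1)
constexpr int RetSAddr = 12;    // GlobalAtomicSaddrPat, returning
constexpr int NoRetSAddr = 13;  // GlobalAtomicSaddrPat, no-return
static_assert(NoRetSAddr > RetSAddr && RetSAddr > NoRetVAddr &&
                  NoRetVAddr > RetVAddr,
              "no-return saddr > saddr > no-return vaddr > vaddr");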
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
new file mode 100644
index 000000000000..83dc3bebf4d3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -0,0 +1,175 @@
+//===- GCNCreateVOPD.cpp - Create VOPD Instructions ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Combine VALU pairs into VOPD instructions.
+/// This only works in wave32 mode and has register-bank requirements; VOPD
+/// creation is rejected when those requirements are not met.
+/// The VOPD pairing mutation in the post-RA machine scheduler (see
+/// GCNVOPDUtils) places candidate instructions for VOPD back-to-back.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "GCNVOPDUtils.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include <utility>
+
+#define DEBUG_TYPE "gcn-create-vopd"
+STATISTIC(NumVOPDCreated, "Number of VOPD Insts Created.");
+
+using namespace llvm;
+
+namespace {
+
+class GCNCreateVOPD : public MachineFunctionPass {
+private:
+public:
+ static char ID;
+ const GCNSubtarget *ST = nullptr;
+
+ GCNCreateVOPD() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "GCN Create VOPD Instructions";
+ }
+
+ bool doReplace(const SIInstrInfo *SII,
+ std::pair<MachineInstr *, MachineInstr *> &Pair) {
+ auto *FirstMI = Pair.first;
+ auto *SecondMI = Pair.second;
+ unsigned Opc1 = FirstMI->getOpcode();
+ unsigned Opc2 = SecondMI->getOpcode();
+ int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1),
+ AMDGPU::getVOPDOpcode(Opc2));
+ assert(NewOpcode != -1 &&
+ "Should have previously determined this as a possible VOPD\n");
+
+ auto VOPDInst = BuildMI(*FirstMI->getParent(), FirstMI,
+ FirstMI->getDebugLoc(), SII->get(NewOpcode))
+ .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags());
+ VOPDInst.add(FirstMI->getOperand(0))
+ .add(SecondMI->getOperand(0))
+ .add(FirstMI->getOperand(1));
+
+ switch (Opc1) {
+ case AMDGPU::V_MOV_B32_e32:
+ break;
+ case AMDGPU::V_FMAMK_F32:
+ case AMDGPU::V_FMAAK_F32:
+ VOPDInst.add(FirstMI->getOperand(2));
+ VOPDInst.add(FirstMI->getOperand(3));
+ break;
+ default:
+ VOPDInst.add(FirstMI->getOperand(2));
+ break;
+ }
+
+ VOPDInst.add(SecondMI->getOperand(1));
+
+ switch (Opc2) {
+ case AMDGPU::V_MOV_B32_e32:
+ break;
+ case AMDGPU::V_FMAMK_F32:
+ case AMDGPU::V_FMAAK_F32:
+ VOPDInst.add(SecondMI->getOperand(2));
+ VOPDInst.add(SecondMI->getOperand(3));
+ break;
+ default:
+ VOPDInst.add(SecondMI->getOperand(2));
+ break;
+ }
+
+ VOPDInst.copyImplicitOps(*FirstMI);
+ VOPDInst.copyImplicitOps(*SecondMI);
+
+ LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: "
+ << *Pair.first << "\tY: " << *Pair.second << "\n");
+ FirstMI->eraseFromParent();
+ SecondMI->eraseFromParent();
+ ++NumVOPDCreated;
+ return true;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32())
+ return false;
+ LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n");
+
+ const SIInstrInfo *SII = ST->getInstrInfo();
+ bool Changed = false;
+
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>> ReplaceCandidates;
+
+ for (auto &MBB : MF) {
+ auto MII = MBB.begin(), E = MBB.end();
+ while (MII != E) {
+ auto *FirstMI = &*MII;
+ MII = next_nodbg(MII, MBB.end());
+ if (MII == MBB.end())
+ break;
+ if (FirstMI->isDebugInstr())
+ continue;
+ auto *SecondMI = &*MII;
+ unsigned Opc = FirstMI->getOpcode();
+ unsigned Opc2 = SecondMI->getOpcode();
+ llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);
+ llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);
+ std::pair<MachineInstr *, MachineInstr *> Pair;
+
+ if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y)
+ Pair = {FirstMI, SecondMI};
+ else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)
+ Pair = {SecondMI, FirstMI};
+ else
+ continue;
+ // checkVOPDRegConstraints cares about program order, but doReplace
+ // cares about the X-Y operand order in the resulting VOPD instruction.
+ if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) {
+ ReplaceCandidates.push_back(Pair);
+ ++MII;
+ }
+ }
+ }
+ for (auto &Pair : ReplaceCandidates) {
+ Changed |= doReplace(SII, Pair);
+ }
+
+ return Changed;
+ }
+};
+
+} // namespace
+
+char GCNCreateVOPD::ID = 0;
+
+char &llvm::GCNCreateVOPDID = GCNCreateVOPD::ID;
+
+INITIALIZE_PASS(GCNCreateVOPD, DEBUG_TYPE, "GCN Create VOPD Instructions",
+ false, false)
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 1cd880eaa48e..5d254518c67a 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -143,13 +143,20 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
}
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
- auto DPP32 = AMDGPU::getDPPOp32(Op);
+ int DPP32 = AMDGPU::getDPPOp32(Op);
if (IsShrinkable) {
assert(DPP32 == -1);
- auto E32 = AMDGPU::getVOPe32(Op);
+ int E32 = AMDGPU::getVOPe32(Op);
DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
}
- return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32;
+ if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
+ return DPP32;
+ int DPP64 = -1;
+ if (ST->hasVOP3DPP())
+ DPP64 = AMDGPU::getDPPOp64(Op);
+ if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
+ return DPP64;
+ return -1;
}
// tracks the register operand definition and returns:
@@ -188,6 +195,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
+ bool HasVOP3DPP = ST->hasVOP3DPP();
auto OrigOp = OrigMI.getOpcode();
auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
if (DPPOp == -1) {
@@ -201,10 +209,18 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
bool Fail = false;
do {
- auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
- assert(Dst);
- DPPInst.add(*Dst);
- int NumOperands = 1;
+ int NumOperands = 0;
+ if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
+ DPPInst.add(*Dst);
+ ++NumOperands;
+ }
+ if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
+ if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
+ DPPInst.add(*SDst);
+ ++NumOperands;
+ }
+ // If we shrunk a 64-bit vop3b to 32 bits, just ignore the sdst.
+ }
const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
if (OldIdx != -1) {
@@ -230,7 +246,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
AMDGPU::OpName::src0_modifiers)) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src0_modifiers));
- assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+ assert(HasVOP3DPP ||
+ (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
DPPInst.addImm(Mod0->getImm());
++NumOperands;
} else if (AMDGPU::getNamedOperandIdx(DPPOp,
@@ -253,7 +270,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
AMDGPU::OpName::src1_modifiers)) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src1_modifiers));
- assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+ assert(HasVOP3DPP ||
+ (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
DPPInst.addImm(Mod1->getImm());
++NumOperands;
} else if (AMDGPU::getNamedOperandIdx(DPPOp,
@@ -261,7 +279,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.addImm(0);
++NumOperands;
}
- if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+ auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+ if (Src1) {
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
Fail = true;
@@ -270,8 +289,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.add(*Src1);
++NumOperands;
}
-
- if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
+ if (auto *Mod2 =
+ TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) {
+ assert(NumOperands ==
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
+ assert(HasVOP3DPP ||
+ (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
+ DPPInst.addImm(Mod2->getImm());
+ ++NumOperands;
+ }
+ auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
+ if (Src2) {
if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
@@ -279,8 +307,62 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
break;
}
DPPInst.add(*Src2);
+ ++NumOperands;
+ }
+ if (HasVOP3DPP) {
+ auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
+ if (ClampOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::clamp) != -1) {
+ DPPInst.addImm(ClampOpr->getImm());
+ }
+ auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
+ if (VdstInOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::vdst_in) != -1) {
+ DPPInst.add(*VdstInOpr);
+ }
+ auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
+ if (OmodOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::omod) != -1) {
+ DPPInst.addImm(OmodOpr->getImm());
+ }
+ // Validate that OP_SEL is set to all 0 and OP_SEL_HI is set to all 1.
+ if (auto *OpSelOpr =
+ TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
+ auto OpSel = OpSelOpr->getImm();
+ if (OpSel != 0) {
+ LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n");
+ Fail = true;
+ break;
+ }
+ if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel) != -1)
+ DPPInst.addImm(OpSel);
+ }
+ if (auto *OpSelHiOpr =
+ TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
+ auto OpSelHi = OpSelHiOpr->getImm();
+ // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
+ // the bitmask for 3 op_sel_hi bits set
+ assert(Src2 && "Expected vop3p with 3 operands");
+ if (OpSelHi != 7) {
+ LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n");
+ Fail = true;
+ break;
+ }
+ if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel_hi) != -1)
+ DPPInst.addImm(OpSelHi);
+ }
+ auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
+ if (NegOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_lo) != -1) {
+ DPPInst.addImm(NegOpr->getImm());
+ }
+ auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
+ if (NegHiOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_hi) != -1) {
+ DPPInst.addImm(NegHiOpr->getImm());
+ }
}
-
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
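
One small clarification of the magic number in the op_sel_hi check above: op_sel_hi is a per-source bitmask, and the VOP3P candidates handled here have three sources, so "all set" is the three-bit mask 0b111, i.e. 7. A trivial check of that arithmetic:

// op_sel_hi carries one bit per source operand; with three sources the
// "all high halves selected" encoding is 0b111, which is 7.
static_assert(((1u << 3) - 1u) == 7u, "three op_sel_hi bits, all set");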
@@ -531,8 +613,16 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
}
bool IsShrinkable = isShrinkable(OrigMI);
- if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
- LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
+ if (!(IsShrinkable ||
+ ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
+ TII->isVOP3(OrigOp)) &&
+ ST->hasVOP3DPP()) ||
+ TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
+ LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n");
+ break;
+ }
+ if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
+ LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n");
break;
}
@@ -543,9 +633,12 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
break;
}
+ auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
assert(Src0 && "Src1 without Src0?");
- if (Src1 && Src1->isIdenticalTo(*Src0)) {
- assert(Src1->isReg());
+ if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
+ (Src2 && Src2->isIdenticalTo(*Src0)))) ||
+ (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
+ (Src2 && Src2->isIdenticalTo(*Src1))))) {
LLVM_DEBUG(
dbgs()
<< " " << OrigMI
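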
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
new file mode 100644
index 000000000000..a5008e39d91a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -0,0 +1,212 @@
+//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU DAG scheduling
+/// mutation to pair VOPD instructions back to back. It also contains
+/// subroutines useful in the creation of VOPD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNVOPDUtils.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-vopd-utils"
+
+bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
+ const MachineInstr &FirstMI,
+ const MachineInstr &SecondMI) {
+ const MachineFunction *MF = FirstMI.getMF();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo());
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned NumVGPRBanks = 4;
+ // Literals also count against scalar bus limit
+ SmallVector<const MachineOperand *> UniqueLiterals;
+ auto addLiteral = [&](const MachineOperand &Op) {
+ for (auto &Literal : UniqueLiterals) {
+ if (Literal->isIdenticalTo(Op))
+ return;
+ }
+ UniqueLiterals.push_back(&Op);
+ };
+ SmallVector<Register> UniqueScalarRegs;
+ assert([&]() -> bool {
+ for (auto MII = MachineBasicBlock::const_iterator(&FirstMI);
+ MII != FirstMI.getParent()->instr_end(); ++MII) {
+ if (&*MII == &SecondMI)
+ return true;
+ }
+ return false;
+ }() && "Expected FirstMI to precede SecondMI");
+ // Cannot pair dependent instructions
+ for (const auto &Use : SecondMI.uses())
+ if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg()))
+ return false;
+
+ struct ComponentInfo {
+ ComponentInfo(const MachineInstr &MI) : MI(MI) {}
+ Register Dst, Reg0, Reg1, Reg2;
+ const MachineInstr &MI;
+ };
+ ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)};
+
+ for (ComponentInfo &Comp : CInfo) {
+ switch (Comp.MI.getOpcode()) {
+ case AMDGPU::V_FMAMK_F32:
+ // cannot inline the fixed literal in fmamk
+ addLiteral(Comp.MI.getOperand(2));
+ Comp.Reg2 = Comp.MI.getOperand(3).getReg();
+ break;
+ case AMDGPU::V_FMAAK_F32:
+ // cannot inline the fixed literal in fmaak
+ addLiteral(Comp.MI.getOperand(3));
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ break;
+ case AMDGPU::V_FMAC_F32_e32:
+ case AMDGPU::V_DOT2_F32_F16:
+ case AMDGPU::V_DOT2_F32_BF16:
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ Comp.Reg2 = Comp.MI.getOperand(0).getReg();
+ break;
+ case AMDGPU::V_CNDMASK_B32_e32:
+ UniqueScalarRegs.push_back(AMDGPU::VCC_LO);
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ break;
+ case AMDGPU::V_MOV_B32_e32:
+ break;
+ default:
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ break;
+ }
+
+ Comp.Dst = Comp.MI.getOperand(0).getReg();
+
+ const MachineOperand &Op0 = Comp.MI.getOperand(1);
+ if (Op0.isReg()) {
+ if (!TRI->isVectorRegister(MRI, Op0.getReg())) {
+ if (!is_contained(UniqueScalarRegs, Op0.getReg()))
+ UniqueScalarRegs.push_back(Op0.getReg());
+ } else
+ Comp.Reg0 = Op0.getReg();
+ } else {
+ if (!TII.isInlineConstant(Comp.MI, 1))
+ addLiteral(Op0);
+ }
+ }
+
+ if (UniqueLiterals.size() > 1)
+ return false;
+ if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
+ return false;
+
+ // check port 0
+ if (CInfo[0].Reg0 && CInfo[1].Reg0 &&
+ CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks)
+ return false;
+ // check port 1
+ if (CInfo[0].Reg1 && CInfo[1].Reg1 &&
+ CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks)
+ return false;
+ // check port 2
+ if (CInfo[0].Reg2 && CInfo[1].Reg2 &&
+ !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1))
+ return false;
+ if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI
+ << "\n\tY: " << SecondMI << "\n");
+ return true;
+}
+
+/// Check if the instruction pair, FirstMI and SecondMI, should be scheduled
+/// together. When FirstMI is unspecified, check only whether SecondMI can be
+/// part of a fused pair at all.
+static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII);
+ unsigned Opc2 = SecondMI.getOpcode();
+ auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);
+
+ // One instruction case
+ if (!FirstMI)
+ return SecondCanBeVOPD.Y;
+
+ unsigned Opc = FirstMI->getOpcode();
+ auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);
+
+ if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) ||
+ (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
+ return false;
+
+ return checkVOPDRegConstraints(STII, *FirstMI, SecondMI);
+}
+
+/// Adapts the design of MacroFusion: greedily pairs candidate instructions
+/// and puts them back-to-back so they can easily be turned into VOPD
+/// instructions. O(n^2) in the number of scheduling units.
+struct VOPDPairingMutation : ScheduleDAGMutation {
+ ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer
+
+ VOPDPairingMutation(
+ ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer
+ : shouldScheduleAdjacent(shouldScheduleAdjacent) {}
+
+ void apply(ScheduleDAGInstrs *DAG) override {
+ const TargetInstrInfo &TII = *DAG->TII;
+ const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
+ if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) {
+ LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n");
+ return;
+ }
+
+ std::vector<SUnit>::iterator ISUI, JSUI;
+ for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) {
+ const MachineInstr *IMI = ISUI->getInstr();
+ if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI))
+ continue;
+ if (!hasLessThanNumFused(*ISUI, 2))
+ continue;
+
+ for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) {
+ if (JSUI->isBoundaryNode())
+ continue;
+ const MachineInstr *JMI = JSUI->getInstr();
+ if (!hasLessThanNumFused(*JSUI, 2) ||
+ !shouldScheduleAdjacent(TII, ST, IMI, *JMI))
+ continue;
+ if (fuseInstructionPair(*DAG, *ISUI, *JSUI))
+ break;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n");
+ }
+};
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() {
+ return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent);
+}
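
The port checks in checkVOPDRegConstraints above reduce to modular arithmetic on the register numbers: two operands feeding the same source port must sit in different VGPR banks (index mod 4), and the two destinations must differ in parity. Below is a standalone illustration of just that arithmetic, using raw VGPR indices rather than llvm::Register values.

// Standalone illustration of the VOPD bank rules checked above (assumes
// 4 VGPR banks and uses raw VGPR indices instead of llvm::Register).
#include <cassert>

constexpr unsigned NumVGPRBanks = 4;

static bool srcPortOk(unsigned RegA, unsigned RegB) {
  // Two reads on the same port must come from different banks.
  return (RegA % NumVGPRBanks) != (RegB % NumVGPRBanks);
}

static bool dstOk(unsigned DstX, unsigned DstY) {
  // The X and Y destinations must have different parity (even/odd).
  return ((DstX ^ DstY) & 1u) != 0;
}

int main() {
  assert(srcPortOk(0, 1));   // v0 and v1: different banks, OK
  assert(!srcPortOk(0, 4));  // v0 and v4: both bank 0, conflict
  assert(dstOk(2, 3));       // even/odd destination pair, OK
  assert(!dstOk(2, 4));      // two even destinations, conflict
  return 0;
}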
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h
new file mode 100644
index 000000000000..22361b9a1a07
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h
@@ -0,0 +1,32 @@
+//===- GCNVOPDUtils.h - GCN VOPD Utils ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU DAG scheduling
+/// mutation to pair VOPD instructions back to back. It also contains
+/// subroutines useful in the creation of VOPD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class SIInstrInfo;
+
+bool checkVOPDRegConstraints(const SIInstrInfo &TII,
+ const MachineInstr &FirstMI,
+ const MachineInstr &SecondMI);
+
+std::unique_ptr<ScheduleDAGMutation> createVOPDPairingMutation();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 02c213f90f89..228963ff2a20 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -62,12 +62,6 @@ public:
virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const = 0;
-
-protected:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 11fe3f9ef058..fba4b1a3db66 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -36,6 +36,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "AMDGPUGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 060d4b660632..c2e2563c3989 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -50,6 +50,7 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "AMDGPUGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 78eb304fe84f..3d926e52c368 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -58,11 +58,6 @@ private:
uint64_t getBinaryCodeForInstr(const MCInst &MI,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
-
};
} // end anonymous namespace
@@ -90,11 +85,8 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
}
void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (MI.getOpcode() == R600::RETURN ||
MI.getOpcode() == R600::FETCH_CLAUSE ||
@@ -187,5 +179,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
return MO.getImm();
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "R600GenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
index 269209a12175..b9ff195e0ddc 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
@@ -13,10 +13,12 @@
#include "R600MCTargetDesc.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "R600GenInstrInfo.inc"
MCInstrInfo *llvm::createR600MCInstrInfo() {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
index 605ae851378d..b4ce748532f8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
@@ -35,6 +35,7 @@ MCInstrInfo *createR600MCInstrInfo();
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
#define GET_INSTRINFO_SCHED_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "R600GenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 5e67fb5ec876..e093d78b2cc6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -310,11 +310,8 @@ uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const {
}
void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
int Opcode = MI.getOpcode();
APInt Encoding, Scratch;
getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI);
@@ -574,5 +571,4 @@ void SIMCCodeEmitter::getMachineOpValueCommon(
llvm_unreachable("Encoding of this operand type is not supported yet.");
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index bf52f7830ad7..5199a37a0519 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1623,7 +1623,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
NewBldVec);
}
-SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
+SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
SelectionDAG &DAG,
const SDLoc &DL) const {
// Old -> New swizzle values
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index 1e75a0432ec3..e7706fa0ef5c 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -74,8 +74,8 @@ private:
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
MachineRegisterInfo & MRI, unsigned dword_offset) const;
- SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG,
- const SDLoc &DL) const;
+ SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
+ SelectionDAG &DAG, const SDLoc &DL) const;
SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
index 8f7807a2b472..f81f5122bbc9 100644
--- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
@@ -13,6 +13,7 @@
//
#include "AMDGPUMCInstLower.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600AsmPrinter.h"
#include "R600Subtarget.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -42,6 +43,9 @@ void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
}
void R600AsmPrinter::emitInstruction(const MachineInstr *MI) {
+ R600_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>();
R600MCInstLower MCInstLowering(OutContext, STI, *this);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 094d5cd58673..d16da2a8b86b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -352,7 +352,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// TODO: Generalize to more vector types.
setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
{MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16},
+ MVT::v4i16, MVT::v4f16},
Custom);
// Deal with vec3 vector operations when widened to vec4.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 814a7c446889..799d34e32d27 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3335,15 +3335,18 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
MachineInstr *DefMI;
- const auto killDef = [&DefMI, &MBB, this]() -> void {
+ const auto killDef = [&]() -> void {
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// The only user is the instruction which will be killed.
- if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg()))
+ Register DefReg = DefMI->getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(DefReg))
return;
// We cannot just remove the DefMI here, calling pass will crash.
DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
DefMI->removeOperand(I);
+ if (LV)
+ LV->getVarInfo(DefReg).AliveBlocks.clear();
};
int64_t Imm;
@@ -3982,6 +3985,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+ int Src3Idx = -1;
+ if (Src0Idx == -1) {
+ // VOPD V_DUAL_* instructions use different operand names.
+ Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
+ Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
+ Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
+ Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
+ }
// Make sure the number of operands is correct.
const MCInstrDesc &Desc = get(Opcode);
@@ -4255,9 +4266,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
- for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+ for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
if (OpIdx == -1)
- break;
+ continue;
const MachineOperand &MO = MI.getOperand(OpIdx);
if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
if (MO.isReg()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 311f9f68e675..1b411eb83eb3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1242,6 +1242,9 @@ namespace AMDGPU {
int getDPPOp32(uint16_t Opcode);
LLVM_READONLY
+ int getDPPOp64(uint16_t Opcode);
+
+ LLVM_READONLY
int getBasicFromSDWAOp(uint16_t Opcode);
LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 29ee9f12b12d..23afd6556bc9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -193,43 +193,32 @@ def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
-multiclass SDBufferAtomicRetNoRet {
- def "_ret" : PatFrag<
- (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset,
- node:$offset, node:$cachepolicy, node:$idxen),
- (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex,
- node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
- node:$idxen)> {
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }];
- let GISelPredicateCode = [{ return true; }];
- }
-
+multiclass SDBufferAtomicNoRet {
def "_noret" : PatFrag<
(ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset,
node:$offset, node:$cachepolicy, node:$idxen),
(!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex,
node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
node:$idxen)> {
- let PredicateCode = [{ return SDValue(N, 0).use_empty(); }];
- let GISelPredicateCode = [{ return false; }];
+ let HasNoUse = true;
}
}
-defm SIbuffer_atomic_swap : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_add : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_sub : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_smin : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_umin : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_smax : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_umax : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_and : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_or : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_xor : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_inc : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_dec : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_fadd : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_fmin : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_fmax : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_swap : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_add : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_sub : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_smin : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_umin : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_smax : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_umax : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_and : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_or : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_xor : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_inc : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_dec : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_fadd : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_fmin : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_fmax : SDBufferAtomicNoRet;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@@ -246,24 +235,13 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
-def SIbuffer_atomic_cmpswap_ret : PatFrag<
- (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset,
- node:$soffset, node:$offset, node:$cachepolicy, node:$idxen),
- (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex,
- node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
- node:$idxen)> {
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }];
- let GISelPredicateCode = [{ return true; }];
-}
-
def SIbuffer_atomic_cmpswap_noret : PatFrag<
(ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset,
node:$soffset, node:$offset, node:$cachepolicy, node:$idxen),
(SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex,
node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
node:$idxen)> {
- let PredicateCode = [{ return SDValue(N, 0).use_empty(); }];
- let GISelPredicateCode = [{ return false; }];
+ let HasNoUse = true;
}
class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
@@ -774,13 +752,13 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
let AddressSpaces = StoreAddress_local.AddrSpaces in {
defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
- defm _local_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
+ defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
IsInt>;
}
let AddressSpaces = StoreAddress_region.AddrSpaces in {
defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
- defm _region_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
+ defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
IsInt>;
}
}
@@ -2194,21 +2172,21 @@ class getAsmVOP3DPPBase <int NumSrcArgs, bit HasDst, bit HasClamp,
"$sdst",
"$vdst"),
""); // use $sdst for VOPC
- string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
- string isrc1 = !if(!eq(NumSrcArgs, 1), "",
- !if(!eq(NumSrcArgs, 2), " $src1",
- " $src1,"));
- string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
-
- string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
- string fsrc1 = !if(!eq(NumSrcArgs, 1), "",
- !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
- " $src1_modifiers,"));
- string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
-
- string src0 = !if(Src0HasMods, fsrc0, isrc0);
- string src1 = !if(Src1HasMods, fsrc1, isrc1);
- string src2 = !if(Src2HasMods, fsrc2, isrc2);
+ string src0nomods = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
+ string src1nomods = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1",
+ " $src1,"));
+ string src2nomods = !if(!eq(NumSrcArgs, 3), " $src2", "");
+
+ string src0mods = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string src1mods = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string src2mods = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
+
+ string src0 = !if(Src0HasMods, src0mods, src0nomods);
+ string src1 = !if(Src1HasMods, src1mods, src1nomods);
+ string src2 = !if(Src2HasMods, src2mods, src2nomods);
string opsel = !if(HasOpSel, "$op_sel", "");
string 3PMods = !if(IsVOP3P,
!if(HasOpSel, "$op_sel_hi", "")
@@ -2559,8 +2537,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
// the asm operand name via this HasModifiers flag
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmVOP3DPPBase = getAsmVOP3DPPBase<NumSrcArgs, HasDst, HasClamp,
- HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasSrc0FloatMods, HasSrc1FloatMods,
- HasSrc2FloatMods, DstVT >.ret;
+ HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
+ HasModifiers, DstVT>.ret;
field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3DPPBase>.ret;
field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3DPPBase>.ret;
field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3DPPBase>.ret;
@@ -2800,6 +2778,14 @@ def getDPPOp32 : InstrMapping {
let ValueCols = [["DPP"]];
}
+def getDPPOp64 : InstrMapping {
+ let FilterClass = "VOP";
+ let RowFields = ["OpName"];
+ let ColFields = ["AsmVariantName"];
+ let KeyCol = ["VOP3"];
+ let ValueCols = [["VOP3_DPP"]];
+}
+
// Maps a commuted opcode to its original version
def getCommuteOrig : InstrMapping {
let FilterClass = "Commutable_REV";
@@ -2961,6 +2947,27 @@ def getVCMPXOpFromVCMP : InstrMapping {
let ValueCols = [["1"]];
}
+def VOPDComponentTable : GenericTable {
+ let FilterClass = "VOPD_Component";
+ let CppTypeName = "VOPDComponentInfo";
+ let Fields = ["BaseVOP", "VOPDOp", "CanBeVOPDX"];
+ let PrimaryKey = ["BaseVOP"];
+ let PrimaryKeyName = "getVOPDComponentHelper";
+}
+
+def VOPDPairs : GenericTable {
+ let FilterClass = "VOPD_Base";
+ let CppTypeName = "VOPDInfo";
+ let Fields = ["Opcode", "OpX", "OpY"];
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getVOPDOpcodeHelper";
+}
+
+def getVOPDInfoFromComponentOpcodes : SearchIndex {
+ let Table = VOPDPairs;
+ let Key = ["OpX", "OpY"];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 829669157893..ce8c03bb8d64 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1449,6 +1449,14 @@ def : BitConvert <v8i32, v16f16, VReg_256>;
def : BitConvert <v8i32, v16i16, VReg_256>;
def : BitConvert <v8f32, v16f16, VReg_256>;
def : BitConvert <v8f32, v16i16, VReg_256>;
+def : BitConvert <v16f16, v4i64, VReg_256>;
+def : BitConvert <v16i16, v4i64, VReg_256>;
+def : BitConvert <v16f16, v4f64, VReg_256>;
+def : BitConvert <v16i16, v4f64, VReg_256>;
+def : BitConvert <v4i64, v16f16, VReg_256>;
+def : BitConvert <v4i64, v16i16, VReg_256>;
+def : BitConvert <v4f64, v16f16, VReg_256>;
+def : BitConvert <v4f64, v16i16, VReg_256>;
// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
@@ -3012,6 +3020,35 @@ multiclass Int16Med3Pat<Instruction med3Inst,
def : FPMed3Pat<f32, V_MED3_F32_e64>;
+class
+IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max,
+ SDPatternOperator max_or_min_oneuse> : AMDGPUPat <
+ (DivergentBinFrag<min_or_max> (max_or_min_oneuse i32:$src0, i32:$src1),
+ i32:$src2),
+ (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
+>;
+
+class
+FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
+ SDPatternOperator max_or_min_oneuse> : GCNPat <
+ (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
+ (VOP3Mods vt:$src1, i32:$src1_mods)),
+ (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
+ (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+ DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+let OtherPredicates = [isGFX11Plus] in {
+def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
+def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
+def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
+def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
+def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
let OtherPredicates = [isGFX9Plus] in {
def : FP16Med3Pat<f16, V_MED3_F16_e64>;
defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>;
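
The GFX11 patterns added above fuse a two-instruction min/max chain into a single V_MAXMIN/V_MINMAX operation, provided the inner node has one use. The scalar shape they match is the usual clamp-style expression; a small illustration, not part of the patch:

#include <algorithm>
#include <cstdint>

// Scalar equivalent of what IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>
// matches: min(max(a, b), c), where the inner max has a single use.
int32_t maxmin_i32(int32_t a, int32_t b, int32_t c) {
  return std::min(std::max(a, b), c);
}
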
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 607383ab8cde..67077a2eaa6b 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -148,6 +148,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addUsedIfAvailable<LiveIntervals>();
// Should preserve the same set that TwoAddressInstructions does.
AU.addPreserved<MachineDominatorTree>();
AU.addPreserved<SlotIndexes>();
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index dd881ec42d53..786b6b61cb23 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -72,7 +72,7 @@ INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID;
-/// Insert restore code for the callee-saved registers used in the function.
+/// Insert spill code for the callee-saved registers used in the function.
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
ArrayRef<CalleeSavedInfo> CSI,
LiveIntervals *LIS) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index e426e938b856..ff5587fbb0ca 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -1883,7 +1883,13 @@ void SIScheduleDAGMI::schedule()
LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
buildDAGWithRegPressure();
+ postprocessDAG();
+
LLVM_DEBUG(dump());
+ if (PrintDAGs)
+ dump();
+ if (ViewMISchedDAGs)
+ viewGraph();
topologicalSort();
findRootsAndBiasEdges(TopRoots, BotRoots);
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 8a66213931ff..6b93769949bc 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2329,13 +2329,13 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
continue;
if (const auto &MOI = MOA.getLoadInfo(MI))
- Changed |= expandLoad(MOI.getValue(), MI);
+ Changed |= expandLoad(MOI.value(), MI);
else if (const auto &MOI = MOA.getStoreInfo(MI))
- Changed |= expandStore(MOI.getValue(), MI);
+ Changed |= expandStore(MOI.value(), MI);
else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
- Changed |= expandAtomicFence(MOI.getValue(), MI);
+ Changed |= expandAtomicFence(MOI.value(), MI);
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
- Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
+ Changed |= expandAtomicCmpxchgOrRmw(MOI.value(), MI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 5215397d5936..66bc46aaefea 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -9,6 +9,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -20,10 +21,40 @@ using namespace llvm;
namespace {
class SIOptimizeExecMasking : public MachineFunctionPass {
+ MachineFunction *MF = nullptr;
+ const GCNSubtarget *ST = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+
+ Register isCopyFromExec(const MachineInstr &MI) const;
+ Register isCopyToExec(const MachineInstr &MI) const;
+ bool removeTerminatorBit(MachineInstr &MI) const;
+ MachineBasicBlock::reverse_iterator
+ fixTerminators(MachineBasicBlock &MBB) const;
+ MachineBasicBlock::reverse_iterator
+ findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I,
+ unsigned CopyToExec) const;
+
+ bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
+ MCRegister Reg, bool UseLiveOuts = false,
+ bool IgnoreStart = false) const;
+ bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
+ MachineInstr *findInstrBackwards(MachineInstr &Origin,
+ std::function<bool(MachineInstr *)> Pred,
+ ArrayRef<MCRegister> NonModifiableRegs,
+ unsigned MaxInstructions = 20) const;
+ MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
+ MCRegister Exec) const;
+ bool optimizeExecSequence() const;
+ bool optimizeVCmpxAndSaveexecSequence() const;
+ bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+ MachineInstr &VCmp,
+ MCRegister Exec) const;
+
public:
static char ID;
-public:
SIOptimizeExecMasking() : MachineFunctionPass(ID) {
initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
}
@@ -53,7 +84,7 @@ char SIOptimizeExecMasking::ID = 0;
char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
/// If \p MI is a copy from exec, return the register copied to.
-static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
+Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64:
@@ -61,8 +92,7 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B32_term: {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isReg() &&
- Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
+ if (Src.isReg() && Src.getReg() == TRI->getExec())
return MI.getOperand(0).getReg();
}
}
@@ -71,14 +101,13 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
}
/// If \p MI is a copy to exec, return the register copied from.
-static Register isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) {
+Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64:
case AMDGPU::S_MOV_B32: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() &&
- Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
+ if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
@@ -173,64 +202,64 @@ static unsigned getSaveExecOp(unsigned Opc) {
// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
-static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
+bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
switch (MI.getOpcode()) {
case AMDGPU::S_MOV_B32_term: {
bool RegSrc = MI.getOperand(1).isReg();
- MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+ MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
return true;
}
case AMDGPU::S_MOV_B64_term: {
bool RegSrc = MI.getOperand(1).isReg();
- MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
+ MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
return true;
}
case AMDGPU::S_XOR_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
+ MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
return true;
}
case AMDGPU::S_XOR_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
+ MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
return true;
}
case AMDGPU::S_OR_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_OR_B64));
+ MI.setDesc(TII->get(AMDGPU::S_OR_B64));
return true;
}
case AMDGPU::S_OR_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_OR_B32));
+ MI.setDesc(TII->get(AMDGPU::S_OR_B32));
return true;
}
case AMDGPU::S_ANDN2_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
+ MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
return true;
}
case AMDGPU::S_ANDN2_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
+ MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
return true;
}
case AMDGPU::S_AND_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_AND_B64));
+ MI.setDesc(TII->get(AMDGPU::S_AND_B64));
return true;
}
case AMDGPU::S_AND_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_AND_B32));
+ MI.setDesc(TII->get(AMDGPU::S_AND_B32));
return true;
}
default:
@@ -241,9 +270,8 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
-static MachineBasicBlock::reverse_iterator fixTerminators(
- const SIInstrInfo &TII,
- MachineBasicBlock &MBB) {
+MachineBasicBlock::reverse_iterator
+SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
bool Seen = false;
@@ -252,7 +280,7 @@ static MachineBasicBlock::reverse_iterator fixTerminators(
if (!I->isTerminator())
return Seen ? FirstNonTerm : I;
- if (removeTerminatorBit(TII, *I)) {
+ if (removeTerminatorBit(*I)) {
if (!Seen) {
FirstNonTerm = I;
Seen = true;
@@ -263,17 +291,15 @@ static MachineBasicBlock::reverse_iterator fixTerminators(
return FirstNonTerm;
}
-static MachineBasicBlock::reverse_iterator findExecCopy(
- const SIInstrInfo &TII,
- const GCNSubtarget &ST,
- MachineBasicBlock &MBB,
- MachineBasicBlock::reverse_iterator I,
- unsigned CopyToExec) {
+MachineBasicBlock::reverse_iterator
+SIOptimizeExecMasking::findExecCopy(MachineBasicBlock &MBB,
+ MachineBasicBlock::reverse_iterator I,
+ unsigned CopyToExec) const {
const unsigned InstLimit = 25;
auto E = MBB.rend();
for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
- Register CopyFromExec = isCopyFromExec(*I, ST);
+ Register CopyFromExec = isCopyFromExec(*I);
if (CopyFromExec.isValid())
return I;
}
@@ -298,11 +324,9 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
// an arbitrary condition based on the current MachineInstr, for instance a
// target instruction. Breaks prematurely by returning nullptr if one of the
// registers given in NonModifiableRegs is modified by the current instruction.
-static MachineInstr *
-findInstrBackwards(MachineInstr &Origin,
- std::function<bool(MachineInstr *)> Pred,
- ArrayRef<MCRegister> NonModifiableRegs,
- const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
+MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
+ MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
+ ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
E = Origin.getParent()->rend();
unsigned CurrentIteration = 0;
@@ -310,7 +334,7 @@ findInstrBackwards(MachineInstr &Origin,
for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
if (A->isDebugInstr())
continue;
-
+
if (Pred(&*A))
return &*A;
@@ -318,209 +342,64 @@ findInstrBackwards(MachineInstr &Origin,
if (A->modifiesRegister(Reg, TRI))
return nullptr;
}
-
+
++CurrentIteration;
}
return nullptr;
}
-
// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by backwards calculating liveness from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and not
// defined in between the instructions.
-static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
- MCRegister Reg, const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI,
- bool useLiveOuts = false,
- bool ignoreStart = false) {
+bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
+ MachineInstr &Start,
+ MCRegister Reg,
+ bool UseLiveOuts,
+ bool IgnoreStart) const {
LivePhysRegs LR(*TRI);
- if (useLiveOuts)
+ if (UseLiveOuts)
LR.addLiveOuts(*Stop.getParent());
MachineBasicBlock::reverse_iterator A(Start);
MachineBasicBlock::reverse_iterator E(Stop);
- if (ignoreStart)
+ if (IgnoreStart)
++A;
for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
LR.stepBackward(*A);
}
- return !LR.available(MRI, Reg);
+ return !LR.available(*MRI, Reg);
}
// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
-static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
- const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI) {
- return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI,
- MRI, true);
+bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
+ MCRegister Reg) const {
+ return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}
-// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-static MachineInstr *findPossibleVCMPVCMPXOptimization(
- MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
- const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
-
- MachineInstr *VCmp = nullptr;
-
- Register SaveExecDest = SaveExec.getOperand(0).getReg();
- if (!TRI->isSGPRReg(MRI, SaveExecDest))
- return nullptr;
-
- MachineOperand *SaveExecSrc0 =
- TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
- if (!SaveExecSrc0->isReg())
- return nullptr;
-
- // Try to find the last v_cmp instruction that defs the saveexec input
- // operand without any write to Exec or the saveexec input operand inbetween.
- VCmp = findInstrBackwards(
- SaveExec,
- [&](MachineInstr *Check) {
- return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
- Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
- },
- {Exec, SaveExecSrc0->getReg()}, TRI);
-
- if (!VCmp)
- return nullptr;
-
- MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
- assert(VCmpDest && "Should have an sdst operand!");
-
- // Check if any of the v_cmp source operands is written by the saveexec.
- MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
- if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
- SaveExec.modifiesRegister(Src0->getReg(), TRI))
- return nullptr;
-
- MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
- if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
- SaveExec.modifiesRegister(Src1->getReg(), TRI))
- return nullptr;
-
- // Don't do the transformation if the destination operand is included in
- // it's MBB Live-outs, meaning it's used in any of it's successors, leading
- // to incorrect code if the v_cmp and therefore the def of
- // the dest operand is removed.
- if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
- return nullptr;
-
- // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
- // s_and_saveexec, skip the optimization.
- if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI,
- false, true) ||
- isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
- return nullptr;
-
- // Try to determine if there is a write to any of the VCmp
- // operands between the saveexec and the vcmp.
- // If yes, additional VGPR spilling might need to be inserted. In this case,
- // it's not worth replacing the instruction sequence.
- SmallVector<MCRegister, 2> NonDefRegs;
- if (Src0->isReg())
- NonDefRegs.push_back(Src0->getReg());
-
- if (Src1->isReg())
- NonDefRegs.push_back(Src1->getReg());
-
- if (!findInstrBackwards(
- SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
- NonDefRegs, TRI))
- return nullptr;
-
- return VCmp;
-}
-
-// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
-// operands extracted from a v_cmp ..., s_and_saveexec pattern.
-static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
- MachineInstr &VCmp, MCRegister Exec,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI) {
- const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
-
- if (NewOpcode == -1)
- return false;
-
- MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
- MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
-
- Register MoveDest = SaveExecInstr.getOperand(0).getReg();
-
- MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
- if (!SaveExecInstr.uses().empty()) {
- bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
- unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
- SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
- .addReg(Exec);
- }
-
- // Omit dst as V_CMPX is implicitly writing to EXEC.
- // Add dummy src and clamp modifiers, if needed.
- auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
- VCmp.getDebugLoc(), TII->get(NewOpcode));
-
- auto TryAddImmediateValueFromNamedOperand =
- [&](unsigned OperandName) -> void {
- if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
- Builder.addImm(Mod->getImm());
- };
-
- TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
- Builder.add(*Src0);
-
- TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
- Builder.add(*Src1);
-
- TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
-
- // The kill flags may no longer be correct.
- if (Src0->isReg())
- MRI.clearKillFlags(Src0->getReg());
- if (Src1->isReg())
- MRI.clearKillFlags(Src1->getReg());
-
- return true;
-}
-
-bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(MF.getFunction()))
- return false;
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
- MachineRegisterInfo *MRI = &MF.getRegInfo();
- MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
- // Optimize sequences emitted for control flow lowering. They are originally
- // emitted as the separate operations because spill code may need to be
- // inserted for the saved copy of exec.
- //
- // x = copy exec
- // z = s_<op>_b64 x, y
- // exec = copy z
- // =>
- // x = s_<op>_saveexec_b64 y
- //
+// Optimize sequences emitted for control flow lowering. They are originally
+// emitted as separate operations because spill code may need to be
+// inserted for the saved copy of exec.
+//
+// x = copy exec
+// z = s_<op>_b64 x, y
+// exec = copy z
+// =>
+// x = s_<op>_saveexec_b64 y
+//
+bool SIOptimizeExecMasking::optimizeExecSequence() const {
+ MCRegister Exec = TRI->getExec();
bool Changed = false;
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
+ for (MachineBasicBlock &MBB : *MF) {
+ MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
MachineBasicBlock::reverse_iterator E = MBB.rend();
if (I == E)
continue;
@@ -532,7 +411,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
unsigned SearchCount = 0;
const unsigned SearchLimit = 5;
while (I != E && SearchCount++ < SearchLimit) {
- CopyToExec = isCopyToExec(*I, ST);
+ CopyToExec = isCopyToExec(*I);
if (CopyToExec)
break;
++I;
@@ -542,8 +421,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
continue;
// Scan backwards to find the def.
- auto CopyToExecInst = &*I;
- auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
+ auto *CopyToExecInst = &*I;
+ auto CopyFromExecInst = findExecCopy(MBB, I, CopyToExec);
if (CopyFromExecInst == E) {
auto PrepareExecInst = std::next(I);
if (PrepareExecInst == E)
@@ -574,8 +453,9 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *SaveExecInst = nullptr;
SmallVector<MachineInstr *, 4> OtherUseInsts;
- for (MachineBasicBlock::iterator J
- = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
+ for (MachineBasicBlock::iterator
+ J = std::next(CopyFromExecInst->getIterator()),
+ JE = I->getIterator();
J != JE; ++J) {
if (SaveExecInst && J->readsRegister(Exec, TRI)) {
LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
@@ -655,58 +535,210 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
CopyFromExec)
- .addReg(OtherOp->getReg());
+ .addReg(OtherOp->getReg());
SaveExecInst->eraseFromParent();
CopyToExecInst->eraseFromParent();
for (MachineInstr *OtherInst : OtherUseInsts) {
- OtherInst->substituteRegister(CopyToExec, Exec,
- AMDGPU::NoSubRegister, *TRI);
+ OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
+ *TRI);
}
Changed = true;
}
- // After all s_op_saveexec instructions are inserted,
- // replace (on GFX10.3 and later)
- // v_cmp_* SGPR, IMM, VGPR
- // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
- // with
- // s_mov_b32 EXEC_SGPR_DEST, exec_lo
- // v_cmpx_* IMM, VGPR
- // to reduce pipeline stalls.
- if (ST.hasGFX10_3Insts()) {
- DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
- const unsigned AndSaveExecOpcode =
- ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
-
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- // Record relevant v_cmp / s_and_saveexec instruction pairs for
- // replacement.
- if (MI.getOpcode() != AndSaveExecOpcode)
- continue;
+ return Changed;
+}
- if (MachineInstr *VCmp =
- findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
- SaveExecVCmpMapping[&MI] = VCmp;
- }
+// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec sequence
+// by looking at an instance of an s_and_saveexec instruction. Returns a pointer
+// to the v_cmp instruction if it is safe to replace the sequence (see the
+// conditions in the function body). This is after register allocation, so some
+// checks on operand dependencies need to be considered.
+MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
+ MachineInstr &SaveExec, MCRegister Exec) const {
+
+ MachineInstr *VCmp = nullptr;
+
+ Register SaveExecDest = SaveExec.getOperand(0).getReg();
+ if (!TRI->isSGPRReg(*MRI, SaveExecDest))
+ return nullptr;
+
+ MachineOperand *SaveExecSrc0 =
+ TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+ if (!SaveExecSrc0->isReg())
+ return nullptr;
+
+ // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand without any write to Exec or the saveexec input operand in between.
+ VCmp = findInstrBackwards(
+ SaveExec,
+ [&](MachineInstr *Check) {
+ return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+ Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+ },
+ {Exec, SaveExecSrc0->getReg()});
+
+ if (!VCmp)
+ return nullptr;
+
+ MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+ assert(VCmpDest && "Should have an sdst operand!");
+
+ // Check if any of the v_cmp source operands is written by the saveexec.
+ MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+ if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
+ SaveExec.modifiesRegister(Src0->getReg(), TRI))
+ return nullptr;
+
+ MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+ if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
+ SaveExec.modifiesRegister(Src1->getReg(), TRI))
+ return nullptr;
+
+ // Don't do the transformation if the destination operand is included in
+  // its MBB Live-outs, meaning it is used in any of its successors, leading
+ // to incorrect code if the v_cmp and therefore the def of
+ // the dest operand is removed.
+ if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+ return nullptr;
+
+ // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+ // s_and_saveexec, skip the optimization.
+ if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
+ true) ||
+ isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
+ return nullptr;
+
+ // Try to determine if there is a write to any of the VCmp
+ // operands between the saveexec and the vcmp.
+ // If yes, additional VGPR spilling might need to be inserted. In this case,
+ // it's not worth replacing the instruction sequence.
+ SmallVector<MCRegister, 2> NonDefRegs;
+ if (Src0->isReg())
+ NonDefRegs.push_back(Src0->getReg());
+
+ if (Src1->isReg())
+ NonDefRegs.push_back(Src1->getReg());
+
+ if (!findInstrBackwards(
+ SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+ NonDefRegs))
+ return nullptr;
+
+ return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
+ MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
+ const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+ if (NewOpcode == -1)
+ return false;
+
+ MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+ Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+ MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
+ if (!SaveExecInstr.uses().empty()) {
+ bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
+ unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+ SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+ .addReg(Exec);
+ }
+
+ // Omit dst as V_CMPX is implicitly writing to EXEC.
+ // Add dummy src and clamp modifiers, if needed.
+ auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+ VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+ auto TryAddImmediateValueFromNamedOperand =
+ [&](unsigned OperandName) -> void {
+ if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
+ Builder.addImm(Mod->getImm());
+ };
+
+ TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
+ Builder.add(*Src0);
+
+ TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
+ Builder.add(*Src1);
+
+ TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
+
+ // The kill flags may no longer be correct.
+ if (Src0->isReg())
+ MRI->clearKillFlags(Src0->getReg());
+ if (Src1->isReg())
+ MRI->clearKillFlags(Src1->getReg());
+
+ return true;
+}
+
+// After all s_op_saveexec instructions are inserted,
+// replace (on GFX10.3 and later)
+// v_cmp_* SGPR, IMM, VGPR
+// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+// with
+// s_mov_b32 EXEC_SGPR_DEST, exec_lo
+// v_cmpx_* IMM, VGPR
+// to reduce pipeline stalls.
+bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
+ if (!ST->hasGFX10_3Insts())
+ return false;
+
+ bool Changed = false;
+
+ DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+ MCRegister Exec = TRI->getExec();
+ const unsigned AndSaveExecOpcode =
+ ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ // Record relevant v_cmp / s_and_saveexec instruction pairs for
+ // replacement.
+ if (MI.getOpcode() != AndSaveExecOpcode)
+ continue;
+
+ if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
+ SaveExecVCmpMapping[&MI] = VCmp;
}
+ }
- for (const auto &Entry : SaveExecVCmpMapping) {
- MachineInstr *SaveExecInstr = Entry.getFirst();
- MachineInstr *VCmpInstr = Entry.getSecond();
+ for (const auto &Entry : SaveExecVCmpMapping) {
+ MachineInstr *SaveExecInstr = Entry.getFirst();
+ MachineInstr *VCmpInstr = Entry.getSecond();
- if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
- TRI, *MRI)) {
- SaveExecInstr->eraseFromParent();
- VCmpInstr->eraseFromParent();
+ if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
+ SaveExecInstr->eraseFromParent();
+ VCmpInstr->eraseFromParent();
- Changed = true;
- }
+ Changed = true;
}
}
return Changed;
}
+
+bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ this->MF = &MF;
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = ST->getInstrInfo();
+ MRI = &MF.getRegInfo();
+
+ bool Changed = optimizeExecSequence();
+ Changed |= optimizeVCmpxAndSaveexecSequence();
+
+ return Changed;
+}
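
The refactor above keeps the existing backward-search helper (findInstrBackwards) but turns it into a member so it can share TRI and MRI. Stripped of MachineInstr details, the core idea is a bounded reverse scan that stops on a match and aborts if a protected value is redefined; a self-contained sketch with made-up container types follows.

#include <cstddef>
#include <vector>

// Generic sketch of the bounded backward scan behind findInstrBackwards:
// visit at most MaxSteps items before Origin, return the first one Pred
// accepts, and give up if Clobbers reports that a protected value was
// redefined on the way. Names and container are illustrative only.
template <typename T, typename PredFn, typename ClobberFn>
const T *findBackwards(const std::vector<T> &Seq, size_t Origin, PredFn Pred,
                       ClobberFn Clobbers, unsigned MaxSteps = 20) {
  for (unsigned N = 0; N < MaxSteps && Origin > 0; ++N) {
    const T &Item = Seq[--Origin];
    if (Pred(Item))
      return &Item;
    if (Clobbers(Item))
      return nullptr; // a value we must not touch was redefined; bail out
  }
  return nullptr;
}
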
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index e5e65a8dbbf1..57dbad468de8 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -159,6 +159,9 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
return false;
Register SelReg = Op1->getReg();
+ if (SelReg.isPhysical())
+ return false;
+
auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS);
if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
return false;
@@ -264,13 +267,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
// Try to remove v_cndmask_b32.
if (SelLI) {
- bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill();
- if (!CanRemoveSel) {
- // Try to shrink the live interval and check for dead def instead.
- LIS->shrinkToUses(SelLI, nullptr);
- CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
- }
- if (CanRemoveSel) {
+ // Kill status must be checked before shrinking the live range.
+ bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill();
+ LIS->shrinkToUses(SelLI);
+ bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
+ if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) {
LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ad1455ed20fd..b32d5bb04d5b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2933,6 +2933,10 @@ MCRegister SIRegisterInfo::getVCC() const {
return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}
+MCRegister SIRegisterInfo::getExec() const {
+ return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+}
+
const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
// VGPR tuples have an alignment requirement on gfx90a variants.
return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 9bfbc253410b..6024158be181 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -344,6 +344,8 @@ public:
MCRegister getVCC() const;
+ MCRegister getExec() const;
+
const TargetRegisterClass *getRegClass(unsigned RCID) const;
// Find reaching register definition
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e4ab72f1095b..2f334e211181 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -277,6 +277,18 @@ struct VOPC64DPPInfo {
uint16_t Opcode;
};
+struct VOPDComponentInfo {
+ uint16_t BaseVOP;
+ uint16_t VOPDOp;
+ bool CanBeVOPDX;
+};
+
+struct VOPDInfo {
+ uint16_t Opcode;
+ uint16_t OpX;
+ uint16_t OpY;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
@@ -293,6 +305,10 @@ struct VOPC64DPPInfo {
#define GET_VOPC64DPPTable_IMPL
#define GET_VOPC64DPP8Table_DECL
#define GET_VOPC64DPP8Table_IMPL
+#define GET_VOPDComponentTable_DECL
+#define GET_VOPDComponentTable_IMPL
+#define GET_VOPDPairs_DECL
+#define GET_VOPDPairs_IMPL
#define GET_WMMAOpcode2AddrMappingTable_DECL
#define GET_WMMAOpcode2AddrMappingTable_IMPL
#define GET_WMMAOpcode3AddrMappingTable_DECL
@@ -398,6 +414,19 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
return Info ? Info->is_gfx940_xdl : false;
}
+CanBeVOPD getCanBeVOPD(unsigned Opc) {
+ const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc);
+ if (Info)
+ return {Info->CanBeVOPDX, 1};
+ else
+ return {0, 0};
+}
+
+unsigned getVOPDOpcode(unsigned Opc) {
+ const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc);
+ return Info ? Info->VOPDOp : ~0u;
+}
+
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
return Info ? Info->Opcode3Addr : ~0u;
@@ -415,6 +444,11 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) {
return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
}
+int getVOPDFull(unsigned OpX, unsigned OpY) {
+ const VOPDInfo *Info = getVOPDInfoFromComponentOpcodes(OpX, OpY);
+ return Info ? Info->Opcode : -1;
+}
+
namespace IsaInfo {
AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index dffeec10a14a..51cf1678207c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -470,6 +470,14 @@ bool getMAIIsDGEMM(unsigned Opc);
LLVM_READONLY
bool getMAIIsGFX940XDL(unsigned Opc);
+struct CanBeVOPD {
+ bool X;
+ bool Y;
+};
+
+LLVM_READONLY
+CanBeVOPD getCanBeVOPD(unsigned Opc);
+
LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
@@ -483,6 +491,12 @@ LLVM_READONLY
int getMCOpcode(uint16_t Opcode, unsigned Gen);
LLVM_READONLY
+unsigned getVOPDOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getVOPDFull(unsigned OpX, unsigned OpY);
+
+LLVM_READONLY
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);
LLVM_READONLY
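
A hedged usage sketch for the VOPD helpers declared above: decide whether two component opcodes can form a dual-issue pair and look up the fused opcode. The function name, the include path, and the way the X/Y roles are assigned are illustrative assumptions, not taken from this patch.

// Sketch only; assumes it is compiled inside the AMDGPU backend so the
// header below resolves. canFormVOPD is a hypothetical caller.
#include "Utils/AMDGPUBaseInfo.h"

static bool canFormVOPD(unsigned OpcA, unsigned OpcB) {
  llvm::AMDGPU::CanBeVOPD A = llvm::AMDGPU::getCanBeVOPD(OpcA);
  llvm::AMDGPU::CanBeVOPD B = llvm::AMDGPU::getCanBeVOPD(OpcB);
  // One opcode must be usable as the X component and the other as Y.
  if (!((A.X && B.Y) || (B.X && A.Y)))
    return false;
  unsigned OpX = (A.X && B.Y) ? OpcA : OpcB;
  unsigned OpY = (OpX == OpcA) ? OpcB : OpcA;
  // Map each component to its VOPD encoding and query the pair table;
  // getVOPDFull returns -1 when no combined opcode exists.
  return llvm::AMDGPU::getVOPDFull(llvm::AMDGPU::getVOPDOpcode(OpX),
                                   llvm::AMDGPU::getVOPDOpcode(OpY)) != -1;
}
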
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 1485a1e63129..b24857edb59a 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -495,9 +495,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));
let InsDPP8 = (ins DstRCDPP:$old,
- Src0DPP:$src0,
- Src1DPP:$src1,
- dpp8:$dpp8, FI:$fi);
+ Src0DPP:$src0,
+ Src1DPP:$src1,
+ dpp8:$dpp8, FI:$fi);
let HasExt = 1;
let HasExtDPP = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index eb6c54a45263..33d3441e94c2 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1108,7 +1108,6 @@ class VOPC64_DPP_Base<bits<10> op, string OpName, VOPProfile P>
// Inst{87-84} ignored by hw
let Inst{91-88} = bank_mask;
let Inst{95-92} = row_mask;
-
}
class VOPC64_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
@@ -1148,7 +1147,6 @@ class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P>
let Inst{40-32} = fi;
let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
let Inst{95-72} = dpp8{23-0};
-
}
class VOPC64_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 8cd3d2fe2c47..187485ffa3ae 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1215,7 +1215,9 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
- let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers));
+ let HasModifiers =
+ !if (Features.IsMAI, 0,
+ !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
}
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> {
@@ -1414,7 +1416,7 @@ multiclass VOP3_Realtriple_with_name_gfx11<bits<10> op, string opName,
VOP3_Real_dpp8_with_name_gfx11<op, opName, asmName>;
multiclass VOP3Only_Realtriple_with_name_gfx11<bits<10> op, string opName,
- string asmName> :
+ string asmName> :
VOP3_Realtriple_with_name_gfx11<op, opName, asmName, 1>;
multiclass VOP3be_Realtriple_gfx11<
diff --git a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
index 0390c01eecb1..cee2fc7d2bf0 100644
--- a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
+++ b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
@@ -49,6 +49,9 @@ public:
} // end anonymous namespace
void ARCAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ ARC_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
SmallString<128> Str;
raw_svector_ostream O(Str);
diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
index d4f74fa77fc4..36b00af2c0b4 100644
--- a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
@@ -26,6 +26,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "ARCGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h
index ab06ce46d99f..5f83b48b36af 100644
--- a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h
+++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h
@@ -28,6 +28,7 @@ class Target;
// Defines symbolic names for the ARC instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "ARCGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 48559a89a30a..73970b9c74c5 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -378,13 +378,13 @@ def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Prefers32BitThumb", "true
def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2",
"Prefer 32-bit alignment for loops">;
-def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "1",
+def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "4",
"Model MVE instructions as a 1 beat per tick architecture">;
def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2",
"Model MVE instructions as a 2 beats per tick architecture">;
-def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "4",
+def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "1",
"Model MVE instructions as a 4 beats per tick architecture">;
/// Some instructions update CPSR partially, which can add false dependency for
@@ -1450,6 +1450,13 @@ def : ProcessorModel<"cortex-m55", CortexM4Model, [ARMv81mMainline,
HasMVEFloatOps,
FeatureFixCMSE_CVE_2021_35465]>;
+def : ProcessorModel<"cortex-m85", CortexM7Model, [ARMv81mMainline,
+ FeatureDSP,
+ FeatureFPARMv8_D16,
+ FeaturePACBTI,
+ FeatureUseMISched,
+ HasMVEFloatOps]>;
+
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,
FeatureHWDivARM,
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 4aa28bc5d28d..57cbd7a3b2b8 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1337,6 +1337,10 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
#include "ARMGenMCPseudoLowering.inc"
void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
+  // TODO FIXME: Enable feature predicate checks once all the tests pass.
+ // ARM_MC::verifyInstructionPredicates(MI->getOpcode(),
+ // getSubtargetInfo().getFeatureBits());
+
const DataLayout &DL = getDataLayout();
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 85e32c08c74c..e6be93e6480a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -450,6 +450,14 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+
+ if (!HasMVEFP) {
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+ }
}
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand);
@@ -13350,14 +13358,14 @@ static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
// to make better use of vaddva style instructions.
if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
- !isa<ConstantSDNode>(N0)) {
+ !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
}
// And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
// add(add(add(A, C), reduce(B)), reduce(D))
if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
- N1.getOpcode() == ISD::ADD) {
+ N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
unsigned N0RedOp = 0;
if (!IsVecReduce(N0.getOperand(N0RedOp))) {
N0RedOp = 1;
@@ -13424,7 +13432,7 @@ static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
};
SDValue X;
- if (N0.getOpcode() == ISD::ADD) {
+ if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
N0.getOperand(1).getOperand(0));
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 3a9946ee810b..ba1d806c8d81 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2247,15 +2247,15 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
return canTailPredicateLoop(L, LI, SE, DL, LAI);
}
-bool ARMTTIImpl::emitGetActiveLaneMask() const {
+PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
- return false;
+ return PredicationStyle::None;
// Intrinsic @llvm.get.active.lane.mask is supported.
// It is used in the MVETailPredication pass, which requires the number of
// elements processed by this vector loop to setup the tail-predicated
// loop.
- return true;
+ return PredicationStyle::Data;
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index d7a2bdb3db15..dcf82e703a7f 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -298,7 +298,7 @@ public:
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE);
- bool emitGetActiveLaneMask() const;
+ PredicationStyle emitGetActiveLaneMask() const;
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 3f1379f135d1..9f85d72cc810 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -133,6 +133,7 @@ static bool getARMLoadDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
}
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "ARMGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index e0c992f4fae2..3066d9ba6783 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -139,6 +139,7 @@ bool isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI);
// Defines symbolic names for the ARM instructions.
//
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "ARMGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 30785340ef12..296801094fbe 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -351,13 +351,13 @@ Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) {
if (!Op0 || !Op1)
return Optional<int64_t>{};
if (I->getOpcode() == Instruction::Add)
- return Optional<int64_t>{Op0.getValue() + Op1.getValue()};
+ return Optional<int64_t>{Op0.value() + Op1.value()};
if (I->getOpcode() == Instruction::Mul)
- return Optional<int64_t>{Op0.getValue() * Op1.getValue()};
+ return Optional<int64_t>{Op0.value() * Op1.value()};
if (I->getOpcode() == Instruction::Shl)
- return Optional<int64_t>{Op0.getValue() << Op1.getValue()};
+ return Optional<int64_t>{Op0.value() << Op1.value()};
if (I->getOpcode() == Instruction::Or)
- return Optional<int64_t>{Op0.getValue() | Op1.getValue()};
+ return Optional<int64_t>{Op0.value() | Op1.value()};
}
return Optional<int64_t>{};
}
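
These hunks, like the SIMemoryLegalizer ones earlier, belong to the mechanical Optional::getValue() -> value() rename that brings llvm::Optional in line with the std::optional interface. A minimal sketch of the same accessor pattern, written against std::optional purely for illustration:

#include <cstdint>
#include <optional>

// value() is only safe after checking for presence, exactly as with the old
// getValue(); the rename does not change behaviour.
int64_t addIfBothKnown(std::optional<int64_t> A, std::optional<int64_t> B) {
  if (!A || !B)
    return 0;
  return A.value() + B.value();
}
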
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 0001e520b1fb..70fc90bf9eb5 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -180,6 +180,10 @@ bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
}
void AVRAsmPrinter::emitInstruction(const MachineInstr *MI) {
+  // FIXME: Enable feature predicate checks once all the tests pass.
+ // AVR_MC::verifyInstructionPredicates(MI->getOpcode(),
+ // getSubtargetInfo().getFeatureBits());
+
AVRMCInstLower MCInstLowering(OutContext, *this);
MCInst I;
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index cdfe4a21105d..ba370261e284 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -27,6 +27,7 @@
#include "llvm/MC/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "AVRGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
index aaf236d82016..e83d674f87cc 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
@@ -49,6 +49,7 @@ std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI);
#include "AVRGenRegisterInfo.inc"
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "AVRGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index 349cdd92ae62..9aad9375d913 100644
--- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -149,6 +149,13 @@ private:
// The base call is not an input of any other preserve_*
// intrinsics.
std::map<CallInst *, CallInfo> BaseAICalls;
+ // A map to hold <AnonRecord, TypeDef> relationships
+ std::map<DICompositeType *, DIDerivedType *> AnonRecords;
+
+ void CheckAnonRecordType(DIDerivedType *ParentTy, DIType *Ty);
+ void CheckCompositeType(DIDerivedType *ParentTy, DICompositeType *CTy);
+ void CheckDerivedType(DIDerivedType *ParentTy, DIDerivedType *DTy);
+ void ResetMetadata(struct CallInfo &CInfo);
bool doTransformation(Function &F);
@@ -221,10 +228,80 @@ bool BPFAbstractMemberAccess::run(Function &F) {
if (M->debug_compile_units().empty())
return false;
+ // For each argument/return/local_variable type, trace the type
+ // pattern like '[derived_type]* [composite_type]' to check
+ // and remember (anon record -> typedef) relations where the
+ // anon record is defined as
+ // typedef [const/volatile/restrict]* [anon record]
+ DISubprogram *SP = F.getSubprogram();
+ if (SP && SP->isDefinition()) {
+ for (DIType *Ty: SP->getType()->getTypeArray())
+ CheckAnonRecordType(nullptr, Ty);
+ for (const DINode *DN : SP->getRetainedNodes()) {
+ if (const auto *DV = dyn_cast<DILocalVariable>(DN))
+ CheckAnonRecordType(nullptr, DV->getType());
+ }
+ }
+
DL = &M->getDataLayout();
return doTransformation(F);
}
+void BPFAbstractMemberAccess::ResetMetadata(struct CallInfo &CInfo) {
+ if (auto Ty = dyn_cast<DICompositeType>(CInfo.Metadata)) {
+ if (AnonRecords.find(Ty) != AnonRecords.end()) {
+ if (AnonRecords[Ty] != nullptr)
+ CInfo.Metadata = AnonRecords[Ty];
+ }
+ }
+}
+
+void BPFAbstractMemberAccess::CheckCompositeType(DIDerivedType *ParentTy,
+ DICompositeType *CTy) {
+ if (!CTy->getName().empty() || !ParentTy ||
+ ParentTy->getTag() != dwarf::DW_TAG_typedef)
+ return;
+
+ if (AnonRecords.find(CTy) == AnonRecords.end()) {
+ AnonRecords[CTy] = ParentTy;
+ return;
+ }
+
+  // Two or more typedefs may point to the same anon record.
+ // If this is the case, set the typedef DIType to be nullptr
+ // to indicate the duplication case.
+ DIDerivedType *CurrTy = AnonRecords[CTy];
+ if (CurrTy == ParentTy)
+ return;
+ AnonRecords[CTy] = nullptr;
+}
+
+void BPFAbstractMemberAccess::CheckDerivedType(DIDerivedType *ParentTy,
+ DIDerivedType *DTy) {
+ DIType *BaseType = DTy->getBaseType();
+ if (!BaseType)
+ return;
+
+ unsigned Tag = DTy->getTag();
+ if (Tag == dwarf::DW_TAG_pointer_type)
+ CheckAnonRecordType(nullptr, BaseType);
+ else if (Tag == dwarf::DW_TAG_typedef)
+ CheckAnonRecordType(DTy, BaseType);
+ else
+ CheckAnonRecordType(ParentTy, BaseType);
+}
+
+void BPFAbstractMemberAccess::CheckAnonRecordType(DIDerivedType *ParentTy,
+ DIType *Ty) {
+ if (!Ty)
+ return;
+
+ if (auto *CTy = dyn_cast<DICompositeType>(Ty))
+ return CheckCompositeType(ParentTy, CTy);
+ else if (auto *DTy = dyn_cast<DIDerivedType>(Ty))
+ return CheckDerivedType(ParentTy, DTy);
+}
+
static bool SkipDIDerivedTag(unsigned Tag, bool skipTypedef) {
if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
Tag != dwarf::DW_TAG_volatile_type &&
@@ -298,6 +375,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
if (!CInfo.Metadata)
report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic");
+ ResetMetadata(CInfo);
CInfo.AccessIndex = getConstant(Call->getArgOperand(1));
CInfo.Base = Call->getArgOperand(0);
return true;
@@ -307,6 +385,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
if (!CInfo.Metadata)
report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic");
+ ResetMetadata(CInfo);
CInfo.AccessIndex = getConstant(Call->getArgOperand(2));
CInfo.Base = Call->getArgOperand(0);
CInfo.RecordAlignment = DL->getABITypeAlign(getBaseElementType(Call));
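
A standalone sketch of the bookkeeping CheckCompositeType() performs above: remember the first typedef naming an anonymous record, and invalidate the entry once a second, different typedef is seen. Plain strings stand in for the DICompositeType/DIDerivedType nodes purely for illustration.

    #include <cassert>
    #include <map>
    #include <string>

    // Maps an anonymous record to the unique typedef naming it, or to an
    // empty string once two different typedefs have been seen.
    static std::map<std::string, std::string> AnonRecords;

    static void checkCompositeType(const std::string &AnonRecord,
                                   const std::string &TypedefName) {
      auto It = AnonRecords.find(AnonRecord);
      if (It == AnonRecords.end()) {
        AnonRecords[AnonRecord] = TypedefName; // first typedef wins
        return;
      }
      if (It->second != TypedefName)
        It->second.clear(); // duplication: no unique typedef, invalidate
    }

    int main() {
      checkCompositeType("anon.0", "__t");
      checkCompositeType("anon.0", "__t"); // same typedef, still unique
      assert(AnonRecords["anon.0"] == "__t");
      checkCompositeType("anon.0", "__u"); // second typedef invalidates
      assert(AnonRecords["anon.0"].empty());
    }
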
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index d6145f53c170..c8849bd50464 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -138,6 +138,9 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
}
void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ BPF_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
MCInst TmpInst;
if (!BTF || !BTF->InstLower(MI, TmpInst)) {
diff --git a/llvm/lib/Target/BPF/BTF.h b/llvm/lib/Target/BPF/BTF.h
index 4540054aaf34..89852be4a8c8 100644
--- a/llvm/lib/Target/BPF/BTF.h
+++ b/llvm/lib/Target/BPF/BTF.h
@@ -48,6 +48,8 @@
#ifndef LLVM_LIB_TARGET_BPF_BTF_H
#define LLVM_LIB_TARGET_BPF_BTF_H
+#include <cstdint>
+
namespace llvm {
namespace BTF {
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index a98d001097bc..cb321906db03 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -31,14 +31,13 @@ using namespace llvm;
namespace {
class BPFMCCodeEmitter : public MCCodeEmitter {
- const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
bool IsLittleEndian;
public:
- BPFMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
+ BPFMCCodeEmitter(const MCInstrInfo &, const MCRegisterInfo &mri,
bool IsLittleEndian)
- : MCII(mcii), MRI(mri), IsLittleEndian(IsLittleEndian) {}
+      : MRI(mri), IsLittleEndian(IsLittleEndian) {}
BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
void operator=(const BPFMCCodeEmitter &) = delete;
~BPFMCCodeEmitter() override = default;
@@ -62,12 +61,6 @@ public:
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
-
-private:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
@@ -117,9 +110,6 @@ static uint8_t SwapBits(uint8_t Val)
void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
unsigned Opcode = MI.getOpcode();
support::endian::Writer OSE(OS,
IsLittleEndian ? support::little : support::big);
@@ -174,5 +164,4 @@ uint64_t BPFMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op,
return Encoding;
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "BPFGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 5a1e251cd29c..77db5f99225e 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/Host.h"
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "BPFGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
index fc190504581c..ea30e714a5b7 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -54,6 +54,7 @@ std::unique_ptr<MCObjectTargetWriter> createBPFELFObjectWriter(uint8_t OSABI);
// Defines symbolic names for the BPF instructions.
//
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "BPFGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
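
Across the targets in this patch, the call to the tblgen-generated verifier moves out of the MCCodeEmitter and into AsmPrinter::emitInstruction, with ENABLE_INSTR_PREDICATE_VERIFIER defined next to GET_INSTRINFO_MC_DESC and the declarations pulled in via GET_INSTRINFO_MC_HELPER_DECLS. The sketch below is a hypothetical, non-LLVM illustration of the underlying check: each opcode's required subtarget features compared against the enabled feature bits.

    #include <bitset>
    #include <cassert>

    // Hypothetical illustration only: a table of required feature bits per
    // opcode, checked against the enabled features, mirroring what the
    // generated verifyInstructionPredicates() does for a real target.
    using FeatureBits = std::bitset<64>;

    static const FeatureBits RequiredFeatures[] = {
        FeatureBits{0b0001}, // opcode 0 needs feature 0
        FeatureBits{0b0110}, // opcode 1 needs features 1 and 2
    };

    static void verifyInstructionPredicates(unsigned Opcode,
                                            const FeatureBits &Enabled) {
      FeatureBits Missing = RequiredFeatures[Opcode] & ~Enabled;
      assert(Missing.none() && "instruction requires a missing feature");
      (void)Missing;
    }

    int main() {
      verifyInstructionPredicates(1, FeatureBits{0b0111}); // ok: all present
    }
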
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
index 0236b22ad379..ea5b4555757e 100644
--- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
@@ -141,6 +141,9 @@ void CSKYAsmPrinter::emitEndOfAsmFile(Module &M) {
}
void CSKYAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ CSKY_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
// Do any auto-generated pseudo lowerings.
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
index 300ecceae906..8d3835b22bb0 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
@@ -153,7 +153,7 @@ def CSKYSymbol : AsmOperandClass {
let ParserMethod = "parseCSKYSymbol";
}
-def br_symbol : Operand<iPTR> {
+def br_symbol : Operand<OtherVT> {
let EncoderMethod =
"getBranchSymbolOpValue<CSKY::fixup_csky_pcrel_imm16_scale2>";
let ParserMatchClass = CSKYSymbol;
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
index 3be1ca8b7998..2d7fb85e89fa 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td
@@ -24,7 +24,7 @@ def CSKY_NIR : SDNode<"CSKYISD::NIR", SDTNone,
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
-def br_symbol_16bit : Operand<iPTR> {
+def br_symbol_16bit : Operand<OtherVT> {
let EncoderMethod =
"getBranchSymbolOpValue<CSKY::fixup_csky_pcrel_imm10_scale2>";
let ParserMatchClass = CSKYSymbol;
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
index 1a69dc8acde0..64f01cd1c9fa 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
@@ -26,6 +26,7 @@
#include "llvm/MC/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "CSKYGenInstrInfo.inc"
#define GET_REGINFO_MC_DESC
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
index 4b8c45e95b74..1137b4d6e9b1 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h
@@ -41,6 +41,7 @@ MCCodeEmitter *createCSKYMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx);
#include "CSKYGenRegisterInfo.inc"
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "CSKYGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 4d6e1a9d3166..709279889653 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -116,7 +116,7 @@ def ThreadId :dxil_op< "ThreadId", 93, ThreadIdClass, ComputeID, "reads the thr
dxil_param<1, "i32", "opcode", "DXIL opcode">,
dxil_param<2, "i32", "component", "component to read (x,y,z)">
]>,
- dxil_map_intrinsic<int_dxil_thread_id>;
+ dxil_map_intrinsic<int_dx_thread_id>;
def GroupId :dxil_op< "GroupId", 94, GroupIdClass, ComputeID, "reads the group ID (SV_GroupID)", "i32;", "rn",
[
@@ -124,7 +124,7 @@ def GroupId :dxil_op< "GroupId", 94, GroupIdClass, ComputeID, "reads the group
dxil_param<1, "i32", "opcode", "DXIL opcode">,
dxil_param<2, "i32", "component", "component to read">
]>,
- dxil_map_intrinsic<int_dxil_group_id>;
+ dxil_map_intrinsic<int_dx_group_id>;
def ThreadIdInGroup :dxil_op< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeID,
"reads the thread ID within the group (SV_GroupThreadID)", "i32;", "rn",
@@ -133,7 +133,7 @@ def ThreadIdInGroup :dxil_op< "ThreadIdInGroup", 95, ThreadIdInGroupClass, Comp
dxil_param<1, "i32", "opcode", "DXIL opcode">,
dxil_param<2, "i32", "component", "component to read (x,y,z)">
]>,
- dxil_map_intrinsic<int_dxil_thread_id_in_group>;
+ dxil_map_intrinsic<int_dx_thread_id_in_group>;
def FlattenedThreadIdInGroup :dxil_op< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeID,
"provides a flattened index for a given thread within a given group (SV_GroupIndex)", "i32;", "rn",
@@ -141,4 +141,4 @@ def FlattenedThreadIdInGroup :dxil_op< "FlattenedThreadIdInGroup", 96, Flattene
dxil_param<0, "i32", "", "result">,
dxil_param<1, "i32", "opcode", "DXIL opcode">
]>,
- dxil_map_intrinsic<int_dxil_flattened_thread_id_in_group>;
+ dxil_map_intrinsic<int_dx_flattened_thread_id_in_group>;
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 494a71e51a89..3e09270a66d0 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -595,6 +595,10 @@ unsigned DXILBitcodeWriter::getEncodedRMWOperation(AtomicRMWInst::BinOp Op) {
return bitc::RMW_FADD;
case AtomicRMWInst::FSub:
return bitc::RMW_FSUB;
+ case AtomicRMWInst::FMax:
+ return bitc::RMW_FMAX;
+ case AtomicRMWInst::FMin:
+ return bitc::RMW_FMIN;
}
}
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 48d339234e9e..1064296b0991 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -743,6 +743,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
/// Print out a single Hexagon MI to the current output stream.
void HexagonAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ Hexagon_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
MCInst MCB;
MCB.setOpcode(Hexagon::BUNDLE);
MCB.addOperand(MCOperand::createImm(0));
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 0b4a95bc9ce5..01501109f3b1 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1024,7 +1024,7 @@ void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const {
for (auto &B : MF) {
auto At = findCFILocation(B);
if (At)
- insertCFIInstructionsAt(B, At.getValue());
+ insertCFIInstructionsAt(B, At.value());
}
}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index ed2856eb1fe9..9c235776c160 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -376,11 +376,9 @@ void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
State.Bundle = &MI;
State.Index = 0;
size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1;
- FeatureBitset Features = computeAvailableFeatures(STI.getFeatureBits());
for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) {
MCInst &HMI = const_cast<MCInst &>(*I.getInst());
- verifyInstructionPredicates(HMI, Features);
EncodeSingleInstruction(HMI, OS, Fixups, STI, parseBits(Last, HMB, HMI));
State.Extended = HexagonMCInstrInfo::isImmext(HMI);
@@ -793,5 +791,4 @@ MCCodeEmitter *llvm::createHexagonMCCodeEmitter(MCInstrInfo const &MII,
return new HexagonMCCodeEmitter(MII, MCT);
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "HexagonGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
index 9e86dc8e4989..151964bf818b 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -81,11 +81,6 @@ private:
// Return parse bits for instruction `MCI' inside bundle `MCB'
uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const;
-
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index d068baf05998..f2d1173cd503 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -46,6 +46,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "HexagonGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index d717e710f3c0..3932077c08f1 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -110,6 +110,7 @@ unsigned HexagonConvertUnits(unsigned ItinUnits, unsigned *Lanes);
//
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_SCHED_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "HexagonGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index d715ba901a2b..33e7068622f1 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -705,14 +705,14 @@ LanaiAsmParser::parseRegister(bool RestoreOnFailure) {
RegNum = MatchRegisterName(Lexer.getTok().getIdentifier());
if (RegNum == 0) {
if (PercentTok && RestoreOnFailure)
- Lexer.UnLex(PercentTok.getValue());
+ Lexer.UnLex(PercentTok.value());
return nullptr;
}
Parser.Lex(); // Eat identifier token
return LanaiOperand::createReg(RegNum, Start, End);
}
if (PercentTok && RestoreOnFailure)
- Lexer.UnLex(PercentTok.getValue());
+ Lexer.UnLex(PercentTok.value());
return nullptr;
}
diff --git a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
index c0b7fd3fdd5d..d142fd3a414f 100644
--- a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
+++ b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -195,6 +195,9 @@ void LanaiAsmPrinter::customEmitInstruction(const MachineInstr *MI) {
}
void LanaiAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ Lanai_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index eb6bf8d3836c..c43450869832 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -28,6 +28,7 @@
#include <string>
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "LanaiGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
index e8da1bc88142..93fe1a4609d8 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
@@ -43,6 +43,7 @@ std::unique_ptr<MCObjectTargetWriter> createLanaiELFObjectWriter(uint8_t OSABI);
// Defines symbolic names for the Lanai instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "LanaiGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
index dd61bb2df077..1467d1757ff0 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
@@ -27,6 +27,9 @@ using namespace llvm;
#include "LoongArchGenMCPseudoLowering.inc"
void LoongArchAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ LoongArch_MC::verifyInstructionPredicates(
+ MI->getOpcode(), getSubtargetInfo().getFeatureBits());
+
// Do any auto-generated pseudo lowerings.
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
index 7e5aa49f227c..b51c19188051 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
@@ -39,6 +39,10 @@ public:
// tblgen'erated function.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
+ // Wrapper needed for tblgenned pseudo lowering.
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+ return lowerLoongArchMachineOperandToMCOperand(MO, MCOp, *this);
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index 5b117d40e0a9..20448492a558 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -11,6 +11,22 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// LoongArch specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_LoongArchMOVGR2FR_W_LA64
+ : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>;
+def SDT_LoongArchMOVFR2GR_S_LA64
+ : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>;
+def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
+
+def loongarch_movgr2fr_w_la64
+ : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>;
+def loongarch_movfr2gr_s_la64
+ : SDNode<"LoongArchISD::MOVFR2GR_S_LA64", SDT_LoongArchMOVFR2GR_S_LA64>;
+def loongarch_ftint : SDNode<"LoongArchISD::FTINT", SDT_LoongArchFTINT>;
+
+//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
@@ -149,6 +165,7 @@ def : PatFPSetcc<SETULT, FCMP_CULT_S, FPR32>;
def : PatFPSetcc<SETULE, FCMP_CULE_S, FPR32>;
def : PatFPSetcc<SETUNE, FCMP_CUNE_S, FPR32>;
def : PatFPSetcc<SETUO, FCMP_CUN_S, FPR32>;
+def : PatFPSetcc<SETLT, FCMP_CLT_S, FPR32>;
// TODO: Match signaling comparison strict_fsetccs with FCMP_S*_S instructions.
@@ -174,4 +191,39 @@ def : PatFPSelectcc<SETULE, FCMP_CULE_S, FSEL_S, FPR32>;
def : PatFPSelectcc<SETUNE, FCMP_CUNE_S, FSEL_S, FPR32>;
def : PatFPSelectcc<SETUO, FCMP_CUN_S, FSEL_S, FPR32>;
+/// Loads
+
+defm : LdPat<load, FLD_S, f32>;
+
+/// Stores
+
+defm : StPat<store, FST_S, FPR32, f32>;
+
+/// Floating point constants
+
+def : Pat<(f32 fpimm0), (MOVGR2FR_W R0)>;
+def : Pat<(f32 fpimm0neg), (FNEG_S (MOVGR2FR_W R0))>;
+def : Pat<(f32 fpimm1), (FFINT_S_W (MOVGR2FR_W (ADDI_W R0, 1)))>;
+
+// FP Conversion
+def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>;
} // Predicates = [HasBasicF]
+
+let Predicates = [HasBasicF, IsLA64] in {
+// GPR -> FPR
+def : Pat<(loongarch_movgr2fr_w_la64 GPR:$src), (MOVGR2FR_W GPR:$src)>;
+// FPR -> GPR
+def : Pat<(loongarch_movfr2gr_s_la64 FPR32:$src),
+ (MOVFR2GR_S FPR32:$src)>;
+// int -> f32
+def : Pat<(f32 (sint_to_fp GPR:$src)), (FFINT_S_W (MOVGR2FR_W GPR:$src))>;
+} // Predicates = [HasBasicF, IsLA64]
+
+let Predicates = [HasBasicF, IsLA32] in {
+// GPR -> FPR
+def : Pat<(bitconvert (i32 GPR:$src)), (MOVGR2FR_W GPR:$src)>;
+// FPR -> GPR
+def : Pat<(i32 (bitconvert FPR32:$src)), (MOVFR2GR_S FPR32:$src)>;
+// int -> f32
+def : Pat<(f32 (sint_to_fp (i32 GPR:$src))), (FFINT_S_W (MOVGR2FR_W GPR:$src))>;
+} // Predicates = [HasBasicF, IsLA32]
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 07fa61f4c361..bb50cec9f4c0 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -131,6 +131,11 @@ def MOVGR2FR_D : FP_MOV<0b0000000100010100101010, "movgr2fr.d", FPR64, GPR>;
def MOVFR2GR_D : FP_MOV<0b0000000100010100101110, "movfr2gr.d", GPR, FPR64>;
} // Predicates = [HasBasicD, IsLA64]
+// Instructions only available on LA32
+let Predicates = [HasBasicD, IsLA32], isCodeGenOnly = 1 in {
+def MOVGR2FR_W_64 : FP_MOV<0b0000000100010100101001, "movgr2fr.w", FPR64, GPR>;
+} // Predicates = [HasBasicD, IsLA32], isCodeGenOnly = 1
+
//===----------------------------------------------------------------------===//
// Pseudo-instructions and codegen patterns
//===----------------------------------------------------------------------===//
@@ -164,6 +169,7 @@ def : PatFPSetcc<SETULT, FCMP_CULT_D, FPR64>;
def : PatFPSetcc<SETULE, FCMP_CULE_D, FPR64>;
def : PatFPSetcc<SETUNE, FCMP_CUNE_D, FPR64>;
def : PatFPSetcc<SETUO, FCMP_CUN_D, FPR64>;
+def : PatFPSetcc<SETLT, FCMP_CLT_D, FPR64>;
// TODO: Match signaling comparison strict_fsetccs with FCMP_S*_D instructions.
@@ -185,4 +191,52 @@ def : PatFPSelectcc<SETULE, FCMP_CULE_D, FSEL_D, FPR64>;
def : PatFPSelectcc<SETUNE, FCMP_CUNE_D, FSEL_D, FPR64>;
def : PatFPSelectcc<SETUO, FCMP_CUN_D, FSEL_D, FPR64>;
+/// Loads
+
+defm : LdPat<load, FLD_D, f64>;
+
+/// Stores
+
+defm : StPat<store, FST_D, FPR64, f64>;
+
+/// FP conversion operations
+
+def : Pat<(loongarch_ftint FPR64:$src), (FTINTRZ_W_D FPR64:$src)>;
+def : Pat<(f64 (loongarch_ftint FPR64:$src)), (FTINTRZ_L_D FPR64:$src)>;
+def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_L_S FPR32:$src)>;
+
+// f64 -> f32
+def : Pat<(f32 (fpround FPR64:$src)), (FCVT_S_D FPR64:$src)>;
+// f32 -> f64
+def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>;
} // Predicates = [HasBasicD]
+
+/// Floating point constants
+
+let Predicates = [HasBasicD, IsLA64] in {
+def : Pat<(f64 fpimm0), (MOVGR2FR_D R0)>;
+def : Pat<(f64 fpimm0neg), (FNEG_D (MOVGR2FR_D R0))>;
+def : Pat<(f64 fpimm1), (FFINT_D_L (MOVGR2FR_D (ADDI_D R0, 1)))>;
+
+// Convert int to FP
+def : Pat<(f64 (sint_to_fp (i64 (sexti32 (i64 GPR:$src))))),
+ (FFINT_D_W (MOVGR2FR_W GPR:$src))>;
+def : Pat<(f64 (sint_to_fp GPR:$src)), (FFINT_D_L (MOVGR2FR_D GPR:$src))>;
+
+def : Pat<(f64 (uint_to_fp (i64 (zexti32 (i64 GPR:$src))))),
+ (FFINT_D_W (MOVGR2FR_W GPR:$src))>;
+
+def : Pat<(bitconvert GPR:$src), (MOVGR2FR_D GPR:$src)>;
+
+// Convert FP to int
+def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>;
+} // Predicates = [HasBasicD, IsLA64]
+
+let Predicates = [HasBasicD, IsLA32] in {
+def : Pat<(f64 fpimm0), (MOVGR2FRH_W (MOVGR2FR_W_64 R0), R0)>;
+def : Pat<(f64 fpimm0neg), (FNEG_D (MOVGR2FRH_W (MOVGR2FR_W_64 R0), R0))>;
+def : Pat<(f64 fpimm1), (FCVT_D_S (FFINT_S_W (MOVGR2FR_W (ADDI_W R0, 1))))>;
+
+// Convert int to FP
+def : Pat<(f64 (sint_to_fp (i32 GPR:$src))), (FFINT_D_W (MOVGR2FR_W GPR:$src))>;
+} // Predicates = [HasBasicD, IsLA32]
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
index 7182d55ca3cf..0d9ec9e2eaaa 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -11,7 +11,9 @@
//===----------------------------------------------------------------------===//
#include "LoongArchFrameLowering.h"
+#include "LoongArchMachineFunctionInfo.h"
#include "LoongArchSubtarget.h"
+#include "MCTargetDesc/LoongArchBaseInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -44,12 +46,178 @@ bool LoongArchFrameLowering::hasBP(const MachineFunction &MF) const {
return MFI.hasVarSizedObjects() && TRI->hasStackRealignment(MF);
}
+void LoongArchFrameLowering::adjustReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register DestReg,
+ Register SrcReg, int64_t Val,
+ MachineInstr::MIFlag Flag) const {
+ const LoongArchInstrInfo *TII = STI.getInstrInfo();
+ bool IsLA64 = STI.is64Bit();
+
+ if (DestReg == SrcReg && Val == 0)
+ return;
+
+ if (isInt<12>(Val)) {
+ // addi.w/d $DstReg, $SrcReg, Val
+ BuildMI(MBB, MBBI, DL,
+ TII->get(IsLA64 ? LoongArch::ADDI_D : LoongArch::ADDI_W), DestReg)
+ .addReg(SrcReg)
+ .addImm(Val)
+ .setMIFlag(Flag);
+ return;
+ }
+
+ report_fatal_error("adjustReg cannot yet handle adjustments >12 bits");
+}
+
+// Determine the size of the frame and maximum call frame size.
+void LoongArchFrameLowering::determineFrameLayout(MachineFunction &MF) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t FrameSize = MFI.getStackSize();
+
+ // Make sure the frame is aligned.
+ FrameSize = alignTo(FrameSize, getStackAlign());
+
+ // Update frame info.
+ MFI.setStackSize(FrameSize);
+}
+
void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- // TODO: Implement this when we have function calls
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const LoongArchRegisterInfo *RI = STI.getRegisterInfo();
+ const LoongArchInstrInfo *TII = STI.getInstrInfo();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+
+ Register SPReg = LoongArch::R3;
+ Register FPReg = LoongArch::R22;
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ // Determine the correct frame layout
+ determineFrameLayout(MF);
+
+ // First, compute final stack size.
+ uint64_t StackSize = MFI.getStackSize();
+
+ // Early exit if there is no need to allocate space in the stack.
+ if (StackSize == 0 && !MFI.adjustsStack())
+ return;
+
+ // Adjust stack.
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
+ // Emit ".cfi_def_cfa_offset StackSize".
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ const auto &CSI = MFI.getCalleeSavedInfo();
+
+ // The frame pointer is callee-saved, and code has been generated for us to
+ // save it to the stack. We need to skip over the storing of callee-saved
+ // registers as the frame pointer must be modified after it has been saved
+ // to the stack, not before.
+ std::advance(MBBI, CSI.size());
+
+ // Iterate over list of callee-saved registers and emit .cfi_offset
+ // directives.
+ for (const auto &Entry : CSI) {
+ int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx());
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, RI->getDwarfRegNum(Entry.getReg(), true), Offset));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Generate new FP.
+ if (hasFP(MF)) {
+ adjustReg(MBB, MBBI, DL, FPReg, SPReg, StackSize, MachineInstr::FrameSetup);
+
+ // Emit ".cfi_def_cfa $fp, 0"
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
+ nullptr, RI->getDwarfRegNum(FPReg, true), 0));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- // TODO: Implement this when we have function calls
+ const LoongArchRegisterInfo *RI = STI.getRegisterInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ Register SPReg = LoongArch::R3;
+
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+ const auto &CSI = MFI.getCalleeSavedInfo();
+ // Skip to before the restores of callee-saved registers.
+ auto LastFrameDestroy = MBBI;
+ if (!CSI.empty())
+ LastFrameDestroy = std::prev(MBBI, CSI.size());
+
+ // Get the number of bytes from FrameInfo.
+ uint64_t StackSize = MFI.getStackSize();
+
+ // Restore the stack pointer.
+ if (RI->hasStackRealignment(MF) || MFI.hasVarSizedObjects()) {
+ assert(hasFP(MF) && "frame pointer should not have been eliminated");
+ adjustReg(MBB, LastFrameDestroy, DL, SPReg, LoongArch::R22, -StackSize,
+ MachineInstr::FrameDestroy);
+ }
+
+ // Deallocate stack
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
+}
+
+void LoongArchFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  // If the function uses a frame pointer, unconditionally spill RA and FP.
+ if (hasFP(MF)) {
+ SavedRegs.set(LoongArch::R1);
+ SavedRegs.set(LoongArch::R22);
+ }
+ // Mark BP as used if function has dedicated base pointer.
+ if (hasBP(MF))
+ SavedRegs.set(LoongArchABI::getBPReg());
+}
+
+StackOffset LoongArchFrameLowering::getFrameIndexReference(
+ const MachineFunction &MF, int FI, Register &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+ // Callee-saved registers should be referenced relative to the stack
+ // pointer (positive offset), otherwise use the frame pointer (negative
+ // offset).
+ const auto &CSI = MFI.getCalleeSavedInfo();
+ int MinCSFI = 0;
+ int MaxCSFI = -1;
+ StackOffset Offset =
+ StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea() +
+ MFI.getOffsetAdjustment());
+
+ if (CSI.size()) {
+ MinCSFI = CSI[0].getFrameIdx();
+ MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+ }
+
+ FrameReg = RI->getFrameRegister(MF);
+ if ((FI >= MinCSFI && FI <= MaxCSFI) || !hasFP(MF)) {
+ FrameReg = LoongArch::R3;
+ Offset += StackOffset::getFixed(MFI.getStackSize());
+ }
+
+ return Offset;
}
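
A minimal sketch of the two arithmetic steps the new prologue relies on: determineFrameLayout() rounds the frame size up to the stack alignment, and adjustReg() only accepts SP adjustments that fit a signed 12-bit addi immediate (the helper names below are illustrative, not the LLVM API).

    #include <cassert>
    #include <cstdint>

    // Round the raw frame size up to the stack alignment, as
    // determineFrameLayout() does.
    static uint64_t alignFrame(uint64_t FrameSize, uint64_t StackAlign) {
      return (FrameSize + StackAlign - 1) / StackAlign * StackAlign;
    }

    // Check whether an SP adjustment fits the 12-bit signed addi immediate
    // that adjustReg() emits; larger adjustments are rejected for now.
    static bool fitsInSignedImm12(int64_t Val) {
      return Val >= -2048 && Val <= 2047;
    }

    int main() {
      uint64_t StackSize = alignFrame(40, 16);
      assert(StackSize == 48);
      // Prologue would emit: addi.d $sp, $sp, -StackSize
      assert(fitsInSignedImm12(-static_cast<int64_t>(StackSize)));
    }
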
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
index 25c53efc10f1..014b666de711 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
@@ -31,8 +31,26 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override {
+ return MBB.erase(MI);
+ }
+
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
+
bool hasFP(const MachineFunction &MF) const override;
bool hasBP(const MachineFunction &MF) const;
+
+private:
+ void determineFrameLayout(MachineFunction &MF) const;
+ void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register DestReg, Register SrcReg,
+ int64_t Val, MachineInstr::MIFlag Flag) const;
};
} // namespace llvm
#endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHFRAMELOWERING_H
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
index cc9ea0255d98..bb40ff817574 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
@@ -33,13 +33,14 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
MVT GRLenVT = Subtarget->getGRLenVT();
SDLoc DL(Node);
+ MVT VT = Node->getSimpleValueType(0);
switch (Opcode) {
default:
break;
case ISD::Constant: {
int64_t Imm = cast<ConstantSDNode>(Node)->getSExtValue();
- if (Imm == 0 && Node->getSimpleValueType(0) == GRLenVT) {
+ if (Imm == 0 && VT == GRLenVT) {
SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
LoongArch::R0, GRLenVT);
ReplaceNode(Node, New.getNode());
@@ -60,6 +61,15 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, Result);
return;
}
+ case ISD::FrameIndex: {
+ SDValue Imm = CurDAG->getTargetConstant(0, DL, GRLenVT);
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
+ unsigned ADDIOp =
+ Subtarget->is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W;
+ ReplaceNode(Node, CurDAG->getMachineNode(ADDIOp, DL, VT, TFI, Imm));
+ return;
+ }
// TODO: Add selection nodes needed later.
}
@@ -67,6 +77,17 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) {
SelectCode(Node);
}
+bool LoongArchDAGToDAGISel::SelectBaseAddr(SDValue Addr, SDValue &Base) {
+  // If this is a FrameIndex, select it directly. Otherwise just let it get
+ // selected to a register independently.
+ if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr))
+ Base =
+ CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getGRLenVT());
+ else
+ Base = Addr;
+ return true;
+}
+
bool LoongArchDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth,
SDValue &ShAmt) {
// Shift instructions on LoongArch only read the lower 5 or 6 bits of the
@@ -125,6 +146,39 @@ bool LoongArchDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth,
return true;
}
+bool LoongArchDAGToDAGISel::selectSExti32(SDValue N, SDValue &Val) {
+ if (N.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) {
+ Val = N.getOperand(0);
+ return true;
+ }
+ MVT VT = N.getSimpleValueType();
+ if (CurDAG->ComputeNumSignBits(N) > (VT.getSizeInBits() - 32)) {
+ Val = N;
+ return true;
+ }
+
+ return false;
+}
+
+bool LoongArchDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) {
+ if (N.getOpcode() == ISD::AND) {
+ auto *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (C && C->getZExtValue() == UINT64_C(0xFFFFFFFF)) {
+ Val = N.getOperand(0);
+ return true;
+ }
+ }
+ MVT VT = N.getSimpleValueType();
+ APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), 32);
+ if (CurDAG->MaskedValueIsZero(N, Mask)) {
+ Val = N;
+ return true;
+ }
+
+ return false;
+}
+
// This pass converts a legalized DAG into a LoongArch-specific DAG, ready
// for instruction scheduling.
FunctionPass *llvm::createLoongArchISelDag(LoongArchTargetMachine &TM) {
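
selectZExti32() accepts either an explicit (and x, 0xffffffff) or any value whose upper 32 bits are known to be zero via MaskedValueIsZero. A standalone sketch of that high-bits test, applied to a precomputed known-zero mask (illustrative only, not the SelectionDAG API):

    #include <cassert>
    #include <cstdint>

    // A value looks zero-extended from 32 bits when every one of its upper
    // 32 bits is known to be zero.
    static bool looksZeroExtendedFrom32(uint64_t KnownZeroMask) {
      const uint64_t High32 = 0xFFFFFFFF00000000ULL;
      return (KnownZeroMask & High32) == High32;
    }

    int main() {
      assert(looksZeroExtendedFrom32(0xFFFFFFFF00000000ULL));  // high half known zero
      assert(!looksZeroExtendedFrom32(0x7FFFFFFF00000000ULL)); // bit 63 unknown
    }
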
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h
index f477129d933c..7ad329a64424 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h
@@ -38,6 +38,8 @@ public:
void Select(SDNode *Node) override;
+ bool SelectBaseAddr(SDValue Addr, SDValue &Base);
+
bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt);
bool selectShiftMaskGRLen(SDValue N, SDValue &ShAmt) {
return selectShiftMask(N, Subtarget->getGRLen(), ShAmt);
@@ -46,6 +48,9 @@ public:
return selectShiftMask(N, 32, ShAmt);
}
+ bool selectSExti32(SDValue N, SDValue &Val);
+ bool selectZExti32(SDValue N, SDValue &Val);
+
// Include the pieces autogenerated from the target description.
#include "LoongArchGenDAGISel.inc"
};
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index d5a469216859..4acf90bd9788 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -17,14 +17,21 @@
#include "LoongArchRegisterInfo.h"
#include "LoongArchSubtarget.h"
#include "LoongArchTargetMachine.h"
+#include "MCTargetDesc/LoongArchMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
#define DEBUG_TYPE "loongarch-isel-lowering"
+static cl::opt<bool> ZeroDivCheck(
+ "loongarch-check-zero-division", cl::Hidden,
+ cl::desc("Trap on integer division by zero."),
+ cl::init(false));
+
LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
const LoongArchSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -37,15 +44,25 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
if (Subtarget.hasBasicD())
addRegisterClass(MVT::f64, &LoongArch::FPR64RegClass);
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, GRLenVT,
+ MVT::i1, Promote);
+
// TODO: add necessary setOperationAction calls later.
setOperationAction(ISD::SHL_PARTS, GRLenVT, Custom);
setOperationAction(ISD::SRA_PARTS, GRLenVT, Custom);
setOperationAction(ISD::SRL_PARTS, GRLenVT, Custom);
+ setOperationAction(ISD::FP_TO_SINT, GRLenVT, Custom);
+
+ setOperationAction({ISD::GlobalAddress, ISD::ConstantPool}, GRLenVT, Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+ if (Subtarget.hasBasicF() && !Subtarget.hasBasicD())
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
}
static const ISD::CondCode FPCCToExpand[] = {ISD::SETOGT, ISD::SETOGE,
@@ -58,10 +75,19 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
if (Subtarget.hasBasicD()) {
setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
}
+ setOperationAction(ISD::BR_CC, GRLenVT, Expand);
setOperationAction(ISD::SELECT_CC, GRLenVT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, GRLenVT, Expand);
+ if (!Subtarget.is64Bit())
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+
+ setOperationAction(ISD::FP_TO_UINT, GRLenVT, Custom);
+ setOperationAction(ISD::UINT_TO_FP, GRLenVT, Custom);
// Compute derived properties from the register classes.
computeRegisterProperties(STI.getRegisterInfo());
@@ -70,11 +96,14 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setBooleanContents(ZeroOrOneBooleanContent);
+ setMaxAtomicSizeInBitsSupported(Subtarget.getGRLen());
+
// Function alignments.
const Align FunctionAlignment(4);
setMinFunctionAlignment(FunctionAlignment);
setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::SRL);
}
@@ -83,6 +112,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
switch (Op.getOpcode()) {
default:
report_fatal_error("unimplemented operand");
+ case ISD::GlobalAddress:
+ return lowerGlobalAddress(Op, DAG);
case ISD::SHL_PARTS:
return lowerShiftLeftParts(Op, DAG);
case ISD::SRA_PARTS:
@@ -96,7 +127,105 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
assert(Op.getOperand(1).getValueType() == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
return SDValue();
+ case ISD::ConstantPool:
+ return lowerConstantPool(Op, DAG);
+ case ISD::FP_TO_SINT:
+ return lowerFP_TO_SINT(Op, DAG);
+ case ISD::BITCAST:
+ return lowerBITCAST(Op, DAG);
+ case ISD::FP_TO_UINT:
+ return SDValue();
+ case ISD::UINT_TO_FP:
+ return lowerUINT_TO_FP(Op, DAG);
+ }
+}
+
+SDValue LoongArchTargetLowering::lowerUINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ SDLoc DL(Op);
+ auto &TLI = DAG.getTargetLoweringInfo();
+ SDValue Tmp1, Tmp2;
+ SDValue Op1 = Op.getOperand(0);
+ if (Op1->getOpcode() == ISD::AssertZext ||
+ Op1->getOpcode() == ISD::AssertSext)
+ return Op;
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op.getOperand(0));
+ SDValue Res = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f64, Trunc);
+ SDNode *N = Res.getNode();
+ TLI.expandUINT_TO_FP(N, Tmp1, Tmp2, DAG);
+ return Tmp1;
+}
+
+SDValue LoongArchTargetLowering::lowerBITCAST(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ SDLoc DL(Op);
+ SDValue Op0 = Op.getOperand(0);
+
+ if (Op.getValueType() == MVT::f32 && Op0.getValueType() == MVT::i32 &&
+ Subtarget.is64Bit() && Subtarget.hasBasicF()) {
+ SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
+ return DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, NewOp0);
}
+ return Op;
+}
+
+SDValue LoongArchTargetLowering::lowerFP_TO_SINT(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ SDLoc DL(Op);
+
+ if (Op.getValueSizeInBits() > 32 && Subtarget.hasBasicF() &&
+ !Subtarget.hasBasicD()) {
+ SDValue Dst =
+ DAG.getNode(LoongArchISD::FTINT, DL, MVT::f32, Op.getOperand(0));
+ return DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Dst);
+ }
+
+ EVT FPTy = EVT::getFloatingPointVT(Op.getValueSizeInBits());
+ SDValue Trunc = DAG.getNode(LoongArchISD::FTINT, DL, FPTy, Op.getOperand(0));
+ return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Trunc);
+}
+
+SDValue LoongArchTargetLowering::lowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT Ty = Op.getValueType();
+ ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+
+ // FIXME: Only support PC-relative addressing to access the symbol.
+ // Target flags will be added later.
+ if (!isPositionIndependent()) {
+ SDValue ConstantN = DAG.getTargetConstantPool(
+ N->getConstVal(), Ty, N->getAlign(), N->getOffset());
+ SDValue AddrHi(DAG.getMachineNode(LoongArch::PCALAU12I, DL, Ty, ConstantN),
+ 0);
+ SDValue Addr(DAG.getMachineNode(Subtarget.is64Bit() ? LoongArch::ADDI_D
+ : LoongArch::ADDI_W,
+ DL, Ty, AddrHi, ConstantN),
+ 0);
+ return Addr;
+ }
+ report_fatal_error("Unable to lower ConstantPool");
+}
+
+SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ unsigned ADDIOp = Subtarget.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W;
+
+ // FIXME: Only support PC-relative addressing to access the symbol.
+ // TODO: Add target flags.
+ if (!isPositionIndependent()) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty);
+ SDValue AddrHi(DAG.getMachineNode(LoongArch::PCALAU12I, DL, Ty, GA), 0);
+ SDValue Addr(DAG.getMachineNode(ADDIOp, DL, Ty, AddrHi, GA), 0);
+ return Addr;
+ }
+ report_fatal_error("Unable to lowerGlobalAddress");
}
SDValue LoongArchTargetLowering::lowerShiftLeftParts(SDValue Op,
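
The pcalau12i + addi.w/d pair above materializes an address from a high 20-bit part and a signed low 12-bit part. The arithmetic below is an assumed illustration of that kind of hi20/lo12 split and its round-trip property, not the exact PCALAU12I page-alignment semantics.

    #include <cassert>
    #include <cstdint>

    // Assumed %pc_hi20/%pc_lo12-style split: hi20 is the offset rounded to
    // the nearest 4KiB multiple, lo12 the signed remainder, so
    // (hi20 << 12) + lo12 reconstructs the offset. Arithmetic right shift of
    // negative offsets is assumed (guaranteed since C++20).
    static void splitOffset(int64_t Offset, int64_t &Hi20, int64_t &Lo12) {
      Hi20 = (Offset + 0x800) >> 12;
      Lo12 = Offset - (Hi20 << 12); // in [-2048, 2047]
    }

    int main() {
      for (int64_t Offset : {0x12345678LL, -0x1234LL, 2047LL, -2048LL}) {
        int64_t Hi20, Lo12;
        splitOffset(Offset, Hi20, Lo12);
        assert(Lo12 >= -2048 && Lo12 <= 2047);
        assert((Hi20 << 12) + Lo12 == Offset);
      }
    }
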
@@ -238,6 +367,36 @@ void LoongArchTargetLowering::ReplaceNodeResults(
break;
}
break;
+ case ISD::FP_TO_SINT: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ SDValue Src = N->getOperand(0);
+ EVT VT = EVT::getFloatingPointVT(N->getValueSizeInBits(0));
+ SDValue Dst = DAG.getNode(LoongArchISD::FTINT, DL, VT, Src);
+ Results.push_back(DAG.getNode(ISD::BITCAST, DL, N->getValueType(0), Dst));
+ break;
+ }
+ case ISD::BITCAST: {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (VT == MVT::i32 && SrcVT == MVT::f32 && Subtarget.is64Bit() &&
+ Subtarget.hasBasicF()) {
+ SDValue Dst =
+ DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Src);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Dst));
+ }
+ break;
+ }
+ case ISD::FP_TO_UINT: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ auto &TLI = DAG.getTargetLoweringInfo();
+ SDValue Tmp1, Tmp2;
+ TLI.expandFP_TO_UINT(N, Tmp1, Tmp2, DAG);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Tmp1));
+ break;
+ }
}
}
@@ -345,6 +504,224 @@ static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const LoongArchSubtarget &Subtarget) {
+ MVT GRLenVT = Subtarget.getGRLenVT();
+ EVT ValTy = N->getValueType(0);
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ ConstantSDNode *CN0, *CN1;
+ SDLoc DL(N);
+ unsigned ValBits = ValTy.getSizeInBits();
+ unsigned MaskIdx0, MaskLen0, MaskIdx1, MaskLen1;
+ unsigned Shamt;
+ bool SwapAndRetried = false;
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (ValBits != 32 && ValBits != 64)
+ return SDValue();
+
+Retry:
+ // 1st pattern to match BSTRINS:
+ // R = or (and X, mask0), (and (shl Y, lsb), mask1)
+ // where mask1 = (2**size - 1) << lsb, mask0 = ~mask1
+ // =>
+ // R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1)
+ if (N0.getOpcode() == ISD::AND &&
+ (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+ isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+ N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL &&
+ (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+ isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) &&
+ MaskIdx0 == MaskIdx1 && MaskLen0 == MaskLen1 &&
+ (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+ (Shamt = CN1->getZExtValue()) == MaskIdx0 &&
+ (MaskIdx0 + MaskLen0 <= ValBits)) {
+ LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 1\n");
+ return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+ N1.getOperand(0).getOperand(0),
+ DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+ DAG.getConstant(MaskIdx0, DL, GRLenVT));
+ }
+
+ // 2nd pattern to match BSTRINS:
+ // R = or (and X, mask0), (shl (and Y, mask1), lsb)
+ // where mask1 = (2**size - 1), mask0 = ~(mask1 << lsb)
+ // =>
+ // R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1)
+ if (N0.getOpcode() == ISD::AND &&
+ (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+ isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+ N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND &&
+ (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+ (Shamt = CN1->getZExtValue()) == MaskIdx0 &&
+ (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+ isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) &&
+ MaskLen0 == MaskLen1 && MaskIdx1 == 0 &&
+ (MaskIdx0 + MaskLen0 <= ValBits)) {
+ LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 2\n");
+ return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+ N1.getOperand(0).getOperand(0),
+ DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+ DAG.getConstant(MaskIdx0, DL, GRLenVT));
+ }
+
+ // 3rd pattern to match BSTRINS:
+ // R = or (and X, mask0), (and Y, mask1)
+ // where ~mask0 = (2**size - 1) << lsb, mask0 & mask1 = 0
+ // =>
+ // R = BSTRINS X, (shr (and Y, mask1), lsb), msb, lsb
+ // where msb = lsb + size - 1
+ if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
+ (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+ isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+ (MaskIdx0 + MaskLen0 <= 64) &&
+ (CN1 = dyn_cast<ConstantSDNode>(N1->getOperand(1))) &&
+ (CN1->getSExtValue() & CN0->getSExtValue()) == 0) {
+ LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 3\n");
+ return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+ DAG.getNode(ISD::SRL, DL, N1->getValueType(0), N1,
+ DAG.getConstant(MaskIdx0, DL, GRLenVT)),
+ DAG.getConstant(ValBits == 32
+ ? (MaskIdx0 + (MaskLen0 & 31) - 1)
+ : (MaskIdx0 + MaskLen0 - 1),
+ DL, GRLenVT),
+ DAG.getConstant(MaskIdx0, DL, GRLenVT));
+ }
+
+ // 4th pattern to match BSTRINS:
+ // R = or (and X, mask), (shl Y, shamt)
+ // where mask = (2**shamt - 1)
+ // =>
+ // R = BSTRINS X, Y, ValBits - 1, shamt
+ // where ValBits = 32 or 64
+ if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::SHL &&
+ (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+ isShiftedMask_64(CN0->getZExtValue(), MaskIdx0, MaskLen0) &&
+ MaskIdx0 == 0 && (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+ (Shamt = CN1->getZExtValue()) == MaskLen0 &&
+ (MaskIdx0 + MaskLen0 <= ValBits)) {
+ LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 4\n");
+ return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+ N1.getOperand(0),
+ DAG.getConstant((ValBits - 1), DL, GRLenVT),
+ DAG.getConstant(Shamt, DL, GRLenVT));
+ }
+
+ // 5th pattern to match BSTRINS:
+ // R = or (and X, mask), const
+ // where ~mask = (2**size - 1) << lsb, mask & const = 0
+ // =>
+ // R = BSTRINS X, (const >> lsb), msb, lsb
+ // where msb = lsb + size - 1
+ if (N0.getOpcode() == ISD::AND &&
+ (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) &&
+ isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) &&
+ (CN1 = dyn_cast<ConstantSDNode>(N1)) &&
+ (CN1->getSExtValue() & CN0->getSExtValue()) == 0) {
+ LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 5\n");
+ return DAG.getNode(
+ LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
+ DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy),
+ DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+ DAG.getConstant(MaskIdx0, DL, GRLenVT));
+ }
+
+ // 6th pattern.
+ // a = b | ((c & mask) << shamt), where all positions in b to be overwritten
+ // by the incoming bits are known to be zero.
+ // =>
+ // a = BSTRINS b, c, shamt + MaskLen - 1, shamt
+ //
+  // Note that the 1st pattern is a special case of the 6th, i.e. the 6th
+ // pattern is more common than the 1st. So we put the 1st before the 6th in
+ // order to match as many nodes as possible.
+ ConstantSDNode *CNMask, *CNShamt;
+ unsigned MaskIdx, MaskLen;
+ if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND &&
+ (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+ isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) &&
+ MaskIdx == 0 && (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+ CNShamt->getZExtValue() + MaskLen <= ValBits) {
+ Shamt = CNShamt->getZExtValue();
+ APInt ShMask(ValBits, CNMask->getZExtValue() << Shamt);
+ if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) {
+ LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 6\n");
+ return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0,
+ N1.getOperand(0).getOperand(0),
+ DAG.getConstant(Shamt + MaskLen - 1, DL, GRLenVT),
+ DAG.getConstant(Shamt, DL, GRLenVT));
+ }
+ }
+
+ // 7th pattern.
+ // a = b | ((c << shamt) & shifted_mask), where all positions in b to be
+ // overwritten by the incoming bits are known to be zero.
+ // =>
+ // a = BSTRINS b, c, MaskIdx + MaskLen - 1, MaskIdx
+ //
+ // Similarly, the 7th pattern is more common than the 2nd. So we put the 2nd
+ // before the 7th in order to match as many nodes as possible.
+ if (N1.getOpcode() == ISD::AND &&
+ (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+ isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) &&
+ N1.getOperand(0).getOpcode() == ISD::SHL &&
+ (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) &&
+ CNShamt->getZExtValue() == MaskIdx) {
+ APInt ShMask(ValBits, CNMask->getZExtValue());
+ if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) {
+ LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 7\n");
+ return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0,
+ N1.getOperand(0).getOperand(0),
+ DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT),
+ DAG.getConstant(MaskIdx, DL, GRLenVT));
+ }
+ }
+
+ // (or a, b) and (or b, a) are equivalent, so swap the operands and retry.
+ if (!SwapAndRetried) {
+ std::swap(N0, N1);
+ SwapAndRetried = true;
+ goto Retry;
+ }
+
+ SwapAndRetried = false;
+Retry2:
+ // 8th pattern.
+ // a = b | (c & shifted_mask), where all positions in b to be overwritten by
+ // the incoming bits are known to be zero.
+ // =>
+ // a = BSTRINS b, c >> MaskIdx, MaskIdx + MaskLen - 1, MaskIdx
+ //
+ // Similarly, the 8th pattern is more common than the 4th and 5th patterns. So
+  // we put it here in order to match as many nodes as possible or generate
+  // fewer instructions.
+ if (N1.getOpcode() == ISD::AND &&
+ (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) &&
+ isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen)) {
+ APInt ShMask(ValBits, CNMask->getZExtValue());
+ if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) {
+ LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 8\n");
+ return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0,
+ DAG.getNode(ISD::SRL, DL, N1->getValueType(0),
+ N1->getOperand(0),
+ DAG.getConstant(MaskIdx, DL, GRLenVT)),
+ DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT),
+ DAG.getConstant(MaskIdx, DL, GRLenVT));
+ }
+ }
+ // Swap N0/N1 and retry.
+ if (!SwapAndRetried) {
+ std::swap(N0, N1);
+ SwapAndRetried = true;
+ goto Retry2;
+ }
+
+ return SDValue();
+}
+
SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
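
bstrins.w/d replaces bits [msb, lsb] of its first source with the low msb-lsb+1 bits of its second source, which is what each of the or/and/shl patterns above reassociates into. Below is a standalone model of that semantics, checked against the mask form used by pattern 1 (names are illustrative, not the LLVM API).

    #include <cassert>
    #include <cstdint>

    // Model of the bstrins semantics targeted above: insert the low
    // (Msb - Lsb + 1) bits of Y into bits [Msb, Lsb] of X.
    static uint64_t bstrins(uint64_t X, uint64_t Y, unsigned Msb, unsigned Lsb) {
      unsigned Len = Msb - Lsb + 1;
      uint64_t Mask = (Len == 64) ? ~0ULL : ((1ULL << Len) - 1) << Lsb;
      return (X & ~Mask) | ((Y << Lsb) & Mask);
    }

    int main() {
      // Pattern 1: R = or (and X, ~mask1), (and (shl Y, lsb), mask1)
      const unsigned Lsb = 8, Len = 16, Msb = Lsb + Len - 1;
      const uint64_t Mask1 = ((1ULL << Len) - 1) << Lsb;
      uint64_t X = 0x0123456789ABCDEFULL, Y = 0xFACE;
      uint64_t ViaOr = (X & ~Mask1) | ((Y << Lsb) & Mask1);
      assert(ViaOr == bstrins(X, Y, Msb, Lsb));
    }
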
@@ -353,12 +730,62 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::AND:
return performANDCombine(N, DAG, DCI, Subtarget);
+ case ISD::OR:
+ return performORCombine(N, DAG, DCI, Subtarget);
case ISD::SRL:
return performSRLCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
}
+static MachineBasicBlock *insertDivByZeroTrap(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ const TargetInstrInfo &TII) {
+ if (!ZeroDivCheck)
+ return &MBB;
+
+ // Build instructions:
+ // div(or mod) $dst, $dividend, $divisor
+ // bnez $divisor, 8
+ // break 7
+ // fallthrough
+ MachineOperand &Divisor = MI.getOperand(2);
+ auto FallThrough = std::next(MI.getIterator());
+
+ BuildMI(MBB, FallThrough, MI.getDebugLoc(), TII.get(LoongArch::BNEZ))
+ .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
+ .addImm(8);
+
+ // See the Linux header arch/loongarch/include/uapi/asm/break.h for the
+ // definition of BRK_DIVZERO.
+ BuildMI(MBB, FallThrough, MI.getDebugLoc(), TII.get(LoongArch::BREAK))
+ .addImm(7/*BRK_DIVZERO*/);
+
+ // Clear Divisor's kill flag.
+ Divisor.setIsKill(false);
+
+ return &MBB;
+}
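Roughly, the inserted sequence behaves like the following C++ sketch, where abort() stands in for the BREAK 7 (BRK_DIVZERO) trap and the helper name is made up for illustration; the bnez offset of 8 just branches over the break instruction when the divisor is nonzero.

    #include <cstdint>
    #include <cstdlib>

    // Hypothetical helper mirroring the guarded division; the real code emits
    // the check as machine instructions after each div/mod when ZeroDivCheck
    // is enabled. A zero divisor yields an unspecified hardware result,
    // modeled here as 0 before the trap fires.
    static int64_t checkedSDiv(int64_t Dividend, int64_t Divisor) {
      int64_t Dst = Divisor ? Dividend / Divisor : 0; // div.d $dst, $dividend, $divisor
      if (Divisor == 0)                               // bnez $divisor, 8 (not taken)
        std::abort();                                 // break 7  /* BRK_DIVZERO */
      return Dst;                                     // fallthrough
    }

    int main() { return checkedSDiv(42, 7) == 6 ? 0 : 1; }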
+
+MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ case LoongArch::DIV_W:
+ case LoongArch::DIV_WU:
+ case LoongArch::MOD_W:
+ case LoongArch::MOD_WU:
+ case LoongArch::DIV_D:
+ case LoongArch::DIV_DU:
+ case LoongArch::MOD_D:
+ case LoongArch::MOD_DU:
+ return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo());
+ break;
+ }
+}
+
const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((LoongArchISD::NodeType)Opcode) {
case LoongArchISD::FIRST_NUMBER:
@@ -369,11 +796,16 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "LoongArchISD::" #node;
// TODO: Add more target-dependent nodes later.
+ NODE_NAME_CASE(CALL)
NODE_NAME_CASE(RET)
NODE_NAME_CASE(SLL_W)
NODE_NAME_CASE(SRA_W)
NODE_NAME_CASE(SRL_W)
+ NODE_NAME_CASE(BSTRINS)
NODE_NAME_CASE(BSTRPICK)
+ NODE_NAME_CASE(MOVGR2FR_W_LA64)
+ NODE_NAME_CASE(MOVFR2GR_S_LA64)
+ NODE_NAME_CASE(FTINT)
}
#undef NODE_NAME_CASE
return nullptr;
@@ -483,6 +915,132 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
return Chain;
}
+// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
+// and output parameter nodes.
+SDValue
+LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ CLI.IsTailCall = false;
+
+ if (IsVarArg)
+ report_fatal_error("LowerCall with varargs not implemented");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Analyze the operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign> ArgLocs;
+ CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+ analyzeOutputArgs(ArgCCInfo, Outs, CC_LoongArch);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = ArgCCInfo.getNextStackOffset();
+
+ for (auto &Arg : Outs) {
+ if (!Arg.Flags.isByVal())
+ continue;
+ report_fatal_error("Passing arguments byval not implemented");
+ }
+
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
+
+ // Copy argument values to their designated locations.
+ SmallVector<std::pair<Register, SDValue>> RegsToPass;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue ArgValue = OutVals[i];
+
+ // Promote the value if needed.
+ // For now, only handle fully promoted arguments.
+ if (VA.getLocInfo() != CCValAssign::Full)
+ report_fatal_error("Unknown loc info");
+
+ if (VA.isRegLoc()) {
+ // Queue up the argument copies and emit them at the end.
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
+ } else {
+ report_fatal_error("Passing arguments via the stack not implemented");
+ }
+ }
+
+ SDValue Glue;
+
+ // Build a sequence of copy-to-reg nodes, chained and glued together.
+ for (auto &Reg : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue);
+ Glue = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress/ExternalSymbol node, turn it into a
+ // TargetGlobalAddress/TargetExternalSymbol node so that legalize won't
+ // split it, and so the direct call can be matched by PseudoCALL.
+ // FIXME: Add target flags for relocation.
+ if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(S->getGlobal(), DL, PtrVT);
+ else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT);
+
+ // The first call operand is the chain and the second is the target address.
+ SmallVector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (auto &Reg : RegsToPass)
+ Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ // Glue the call to the argument copies, if any.
+ if (Glue.getNode())
+ Ops.push_back(Glue);
+
+ // Emit the call.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
+ Glue = Chain.getValue(1);
+
+ // Mark the end of the call, which is glued to the call itself.
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, DL, PtrVT, true),
+ DAG.getConstant(0, DL, PtrVT, true), Glue, DL);
+ Glue = Chain.getValue(1);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign> RVLocs;
+ CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+ analyzeInputArgs(RetCCInfo, Ins, CC_LoongArch);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (auto &VA : RVLocs) {
+ // Copy the value out.
+ SDValue RetValue =
+ DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue);
+ Chain = RetValue.getValue(1);
+ Glue = RetValue.getValue(2);
+
+ InVals.push_back(Chain.getValue(0));
+ }
+
+ return Chain;
+}
+
bool LoongArchTargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
@@ -529,3 +1087,14 @@ SDValue LoongArchTargetLowering::LowerReturn(
return DAG.getNode(LoongArchISD::RET, DL, MVT::Other, RetOps);
}
+
+bool LoongArchTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
+ assert((VT == MVT::f32 || VT == MVT::f64) && "Unexpected VT");
+
+ if (VT == MVT::f32 && !Subtarget.hasBasicF())
+ return false;
+ if (VT == MVT::f64 && !Subtarget.hasBasicD())
+ return false;
+ return (Imm.isZero() || Imm.isExactlyValue(+1.0));
+}
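Stated in plain terms, the hook only treats +/-0.0 and exactly +1.0 as cheap FP immediates, and only when the matching FP feature is present. A minimal standalone restatement with plain doubles instead of APFloat, with the feature checks deliberately omitted:

    #include <cassert>

    // Assumed simplification: the hasBasicF/hasBasicD feature checks are omitted.
    static bool isCheapFPImm(double Imm) {
      return Imm == 0.0 || Imm == 1.0; // mirrors Imm.isZero() || isExactlyValue(+1.0)
    }

    int main() {
      assert(isCheapFPImm(0.0) && isCheapFPImm(-0.0) && isCheapFPImm(1.0));
      assert(!isCheapFPImm(0.5) && !isCheapFPImm(-1.0) && !isCheapFPImm(2.0));
      return 0;
    }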
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index c852577a3744..279550482675 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -27,6 +27,7 @@ enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
// TODO: add more LoongArchISDs
+ CALL,
RET,
// 32-bit shifts, directly matching the semantics of the named LoongArch
// instructions.
@@ -34,6 +35,13 @@ enum NodeType : unsigned {
SRA_W,
SRL_W,
+ // FPR<->GPR transfer operations
+ MOVGR2FR_W_LA64,
+ MOVFR2GR_S_LA64,
+
+ FTINT,
+
+ BSTRINS,
BSTRPICK,
};
@@ -72,6 +80,8 @@ public:
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
private:
/// Target-specific function used to lower LoongArch calling conventions.
@@ -86,8 +96,24 @@ private:
const SmallVectorImpl<ISD::OutputArg> &Outs,
LoongArchCCAssignFn Fn) const;
+ SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
+ SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
+
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+ return isa<LoadInst>(I) || isa<StoreInst>(I);
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 146ef53befd5..bcbd4b28f3c7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -12,6 +12,7 @@
#include "LoongArchInstrInfo.h"
#include "LoongArch.h"
+#include "LoongArchMachineFunctionInfo.h"
using namespace llvm;
@@ -19,8 +20,8 @@ using namespace llvm;
#include "LoongArchGenInstrInfo.inc"
LoongArchInstrInfo::LoongArchInstrInfo(LoongArchSubtarget &STI)
- // FIXME: add CFSetup and CFDestroy Inst when we implement function call.
- : LoongArchGenInstrInfo() {}
+ : LoongArchGenInstrInfo(LoongArch::ADJCALLSTACKDOWN,
+ LoongArch::ADJCALLSTACKUP) {}
void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -47,3 +48,68 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, DL, get(Opc), DstReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
+
+void LoongArchInstrInfo::storeRegToStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg,
+ bool IsKill, int FI, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (I != MBB.end())
+ DL = I->getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+
+ unsigned Opcode;
+ if (LoongArch::GPRRegClass.hasSubClassEq(RC))
+ Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32
+ ? LoongArch::ST_W
+ : LoongArch::ST_D;
+ else if (LoongArch::FPR32RegClass.hasSubClassEq(RC))
+ Opcode = LoongArch::FST_S;
+ else if (LoongArch::FPR64RegClass.hasSubClassEq(RC))
+ Opcode = LoongArch::FST_D;
+ else
+ llvm_unreachable("Can't store this register to stack slot");
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+
+ BuildMI(MBB, I, DL, get(Opcode))
+ .addReg(SrcReg, getKillRegState(IsKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+}
+
+void LoongArchInstrInfo::loadRegFromStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg,
+ int FI, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (I != MBB.end())
+ DL = I->getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+
+ unsigned Opcode;
+ if (LoongArch::GPRRegClass.hasSubClassEq(RC))
+ Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32
+ ? LoongArch::LD_W
+ : LoongArch::LD_D;
+ else if (LoongArch::FPR32RegClass.hasSubClassEq(RC))
+ Opcode = LoongArch::FLD_S;
+ else if (LoongArch::FPR64RegClass.hasSubClassEq(RC))
+ Opcode = LoongArch::FLD_D;
+ else
+ llvm_unreachable("Can't load this register from stack slot");
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+
+ BuildMI(MBB, I, DL, get(Opcode), DstReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO);
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index f31943b85a51..0a8c86a5e0c2 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -30,6 +30,16 @@ public:
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, MCRegister DstReg, MCRegister SrcReg,
bool KillSrc) const override;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, Register SrcReg,
+ bool IsKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, Register DstReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 6b8ee9e43f94..d07d086bd7da 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -14,22 +14,45 @@
// LoongArch specific DAG Nodes.
//===----------------------------------------------------------------------===//
+// Target-independent type requirements, but with target-specific formats.
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
// Target-dependent type requirements.
+def SDT_LoongArchCall : SDTypeProfile<0, -1, [SDTCisVT<0, GRLenVT>]>;
def SDT_LoongArchIntBinOpW : SDTypeProfile<1, 2, [
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>
]>;
+def SDT_LoongArchBStrIns: SDTypeProfile<1, 4, [
+ SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>,
+ SDTCisSameAs<3, 4>
+]>;
+
def SDT_LoongArchBStrPick: SDTypeProfile<1, 3, [
SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<2, 3>
]>;
// TODO: Add LoongArch specific DAG Nodes
+// Target-independent nodes, but with target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
+ [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
// Target-dependent nodes.
+def loongarch_call : SDNode<"LoongArchISD::CALL", SDT_LoongArchCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
+def loongarch_bstrins
+ : SDNode<"LoongArchISD::BSTRINS", SDT_LoongArchBStrIns>;
def loongarch_bstrpick
: SDNode<"LoongArchISD::BSTRPICK", SDT_LoongArchBStrPick>;
@@ -106,7 +129,14 @@ def simm16 : Operand<GRLenVT> {
let DecoderMethod = "decodeSImmOperand<16>";
}
-def simm16_lsl2 : Operand<GRLenVT> {
+def simm16_lsl2 : Operand<GRLenVT>,
+ ImmLeaf<GRLenVT, [{return isInt<16>(Imm>>2);}]> {
+ let ParserMatchClass = SImmAsmOperand<16, "lsl2">;
+ let EncoderMethod = "getImmOpValueAsr2";
+ let DecoderMethod = "decodeSImmOperand<16, 2>";
+}
+
+def simm16_lsl2_br : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<16, "lsl2">;
let EncoderMethod = "getImmOpValueAsr2";
let DecoderMethod = "decodeSImmOperand<16, 2>";
@@ -117,13 +147,13 @@ def simm20 : Operand<GRLenVT> {
let DecoderMethod = "decodeSImmOperand<20>";
}
-def simm21_lsl2 : Operand<GRLenVT> {
+def simm21_lsl2 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<21, "lsl2">;
let EncoderMethod = "getImmOpValueAsr2";
let DecoderMethod = "decodeSImmOperand<21, 2>";
}
-def simm26_lsl2 : Operand<GRLenVT> {
+def simm26_lsl2 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<26, "lsl2">;
let EncoderMethod = "getImmOpValueAsr2";
let DecoderMethod = "decodeSImmOperand<26, 2>";
@@ -141,6 +171,24 @@ def NegImm : SDNodeXForm<imm, [{
N->getValueType(0));
}]>;
+// FP immediate patterns.
+def fpimm0 : PatLeaf<(fpimm), [{return N->isExactlyValue(+0.0);}]>;
+def fpimm0neg : PatLeaf<(fpimm), [{return N->isExactlyValue(-0.0);}]>;
+def fpimm1 : PatLeaf<(fpimm), [{return N->isExactlyValue(+1.0);}]>;
+
+def CallSymbol: AsmOperandClass {
+ let Name = "CallSymbol";
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isImm";
+}
+
+// A bare symbol used only in calls.
+def call_symbol : Operand<iPTR> {
+ let ParserMatchClass = CallSymbol;
+}
+
+def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">;
+
//===----------------------------------------------------------------------===//
// Instruction Formats
//===----------------------------------------------------------------------===//
@@ -185,7 +233,7 @@ class RDTIME_2R<bits<22> op, string opstr>
: Fmt2R<op, (outs GPR:$rd, GPR:$rj), (ins), opstr, "$rd, $rj">;
class BrCC_2RI16<bits<6> op, string opstr>
- : Fmt2RI16<op, (outs), (ins GPR:$rj, GPR:$rd, simm16_lsl2:$imm16), opstr,
+ : Fmt2RI16<op, (outs), (ins GPR:$rj, GPR:$rd, simm16_lsl2_br:$imm16), opstr,
"$rj, $rd, $imm16"> {
let isBranch = 1;
let isTerminator = 1;
@@ -274,10 +322,12 @@ def XORI : ALU_2RI12<0b0000001111, "xori", uimm12>;
def MUL_W : ALU_3R<0b00000000000111000, "mul.w">;
def MULH_W : ALU_3R<0b00000000000111001, "mulh.w">;
def MULH_WU : ALU_3R<0b00000000000111010, "mulh.wu">;
+let usesCustomInserter = true in {
def DIV_W : ALU_3R<0b00000000001000000, "div.w">;
def MOD_W : ALU_3R<0b00000000001000001, "mod.w">;
def DIV_WU : ALU_3R<0b00000000001000010, "div.wu">;
def MOD_WU : ALU_3R<0b00000000001000011, "mod.wu">;
+} // usesCustomInserter = true
// Bit-shift Instructions
def SLL_W : ALU_3R<0b00000000000101110, "sll.w">;
@@ -379,10 +429,12 @@ def MULH_D : ALU_3R<0b00000000000111100, "mulh.d">;
def MULH_DU : ALU_3R<0b00000000000111101, "mulh.du">;
def MULW_D_W : ALU_3R<0b00000000000111110, "mulw.d.w">;
def MULW_D_WU : ALU_3R<0b00000000000111111, "mulw.d.wu">;
+let usesCustomInserter = true in {
def DIV_D : ALU_3R<0b00000000001000100, "div.d">;
def MOD_D : ALU_3R<0b00000000001000101, "mod.d">;
def DIV_DU : ALU_3R<0b00000000001000110, "div.du">;
def MOD_DU : ALU_3R<0b00000000001000111, "mod.du">;
+} // usesCustomInserter = true
// Bit-shift Instructions for 64-bits
def SLL_D : ALU_3R<0b00000000000110001, "sll.d">;
@@ -545,6 +597,9 @@ def shiftMaskGRLen
: ComplexPattern<GRLenVT, 1, "selectShiftMaskGRLen", [], [], 0>;
def shiftMask32 : ComplexPattern<i64, 1, "selectShiftMask32", [], [], 0>;
+def sexti32 : ComplexPattern<i64, 1, "selectSExti32">;
+def zexti32 : ComplexPattern<i64, 1, "selectZExti32">;
+
class shiftop<SDPatternOperator operator>
: PatFrag<(ops node:$val, node:$count),
(operator node:$val, (GRLenVT (shiftMaskGRLen node:$count)))>;
@@ -556,6 +611,13 @@ let Predicates = [IsLA32] in {
def : PatGprGpr<add, ADD_W>;
def : PatGprImm<add, ADDI_W, simm12>;
def : PatGprGpr<sub, SUB_W>;
+def : PatGprGpr<sdiv, DIV_W>;
+def : PatGprGpr<udiv, DIV_WU>;
+def : PatGprGpr<srem, MOD_W>;
+def : PatGprGpr<urem, MOD_WU>;
+def : PatGprGpr<mul, MUL_W>;
+def : PatGprGpr<mulhs, MULH_W>;
+def : PatGprGpr<mulhu, MULH_WU>;
} // Predicates = [IsLA32]
let Predicates = [IsLA64] in {
@@ -565,6 +627,24 @@ def : PatGprImm<add, ADDI_D, simm12>;
def : PatGprImm_32<add, ADDI_W, simm12>;
def : PatGprGpr<sub, SUB_D>;
def : PatGprGpr_32<sub, SUB_W>;
+def : PatGprGpr<sdiv, DIV_D>;
+def : PatGprGpr<udiv, DIV_DU>;
+def : PatGprGpr<srem, MOD_D>;
+def : PatGprGpr<urem, MOD_DU>;
+// TODO: Select "_W[U]" instructions for i32xi32 if only the lower 32 bits of
+// the product are used.
+def : PatGprGpr<mul, MUL_D>;
+def : PatGprGpr<mulhs, MULH_D>;
+def : PatGprGpr<mulhu, MULH_DU>;
+// Select MULW_D_W for calculating the full 64-bit product of i32xi32 signed
+// multiplication.
+def : Pat<(i64 (mul (sext_inreg GPR:$rj, i32), (sext_inreg GPR:$rk, i32))),
+ (MULW_D_W GPR:$rj, GPR:$rk)>;
+// Select MULW_D_WU for calculating the full 64-bit product of i32xi32
+// unsigned multiplication.
+def : Pat<(i64 (mul (loongarch_bstrpick GPR:$rj, (i64 31), (i64 0)),
+ (loongarch_bstrpick GPR:$rk, (i64 31), (i64 0)))),
+ (MULW_D_WU GPR:$rj, GPR:$rk)>;
} // Predicates = [IsLA64]
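As a plain C++ restatement of what the two widening-multiply patterns compute (the helper names below are made up for illustration): MULW_D_W yields the full signed i32 x i32 -> i64 product, MULW_D_WU the unsigned one.

    #include <cassert>
    #include <cstdint>

    static int64_t mulw_d_w(int32_t A, int32_t B) {     // both operands sign-extended
      return static_cast<int64_t>(A) * static_cast<int64_t>(B);
    }

    static uint64_t mulw_d_wu(uint32_t A, uint32_t B) { // both operands zero-extended
      return static_cast<uint64_t>(A) * static_cast<uint64_t>(B);
    }

    int main() {
      assert(mulw_d_w(-3, 100000000) == -300000000LL);
      assert(mulw_d_wu(0xffffffffu, 2u) == 0x1fffffffeULL);
      return 0;
    }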
def : PatGprGpr<and, AND>;
@@ -649,19 +729,143 @@ def : Pat<(select GPR:$cond, GPR:$t, GPR:$f),
/// Branches and jumps
+class BccPat<PatFrag CondOp, LAInst Inst>
+ : Pat<(brcond (GRLenVT (CondOp GPR:$rj, GPR:$rd)), bb:$imm16),
+ (Inst GPR:$rj, GPR:$rd, bb:$imm16)>;
+
+def : BccPat<seteq, BEQ>;
+def : BccPat<setne, BNE>;
+def : BccPat<setlt, BLT>;
+def : BccPat<setge, BGE>;
+def : BccPat<setult, BLTU>;
+def : BccPat<setuge, BGEU>;
+
+class BccSwapPat<PatFrag CondOp, LAInst InstBcc>
+ : Pat<(brcond (GRLenVT (CondOp GPR:$rd, GPR:$rj)), bb:$imm16),
+ (InstBcc GPR:$rj, GPR:$rd, bb:$imm16)>;
+
+// Condition codes that don't have matching LoongArch branch instructions, but
+// are trivially supported by swapping the two input operands.
+def : BccSwapPat<setgt, BLT>;
+def : BccSwapPat<setle, BGE>;
+def : BccSwapPat<setugt, BLTU>;
+def : BccSwapPat<setule, BGEU>;
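These four patterns lean on the usual comparison identities (a > b is the same test as b < a, a <= b the same as b >= a, and likewise for the unsigned forms); a quick standalone check of those identities:

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main() {
      for (int64_t A : {-2, 0, 3})
        for (int64_t B : {-2, 0, 3}) {
          assert((A > B) == (B < A));   // setgt  -> BLT  with operands swapped
          assert((A <= B) == (B >= A)); // setle  -> BGE  with operands swapped
          uint64_t UA = static_cast<uint64_t>(A);
          uint64_t UB = static_cast<uint64_t>(B);
          assert((UA > UB) == (UB < UA));   // setugt -> BLTU with operands swapped
          assert((UA <= UB) == (UB >= UA)); // setule -> BGEU with operands swapped
        }
      return 0;
    }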
+
+// An extra pattern is needed for a brcond without a setcc (i.e. where the
+// condition was calculated elsewhere).
+def : Pat<(brcond GPR:$rj, bb:$imm21), (BNEZ GPR:$rj, bb:$imm21)>;
+
+let isBarrier = 1, isBranch = 1, isTerminator = 1 in
+def PseudoBR : Pseudo<(outs), (ins simm26_lsl2:$imm26), [(br bb:$imm26)]>,
+ PseudoInstExpansion<(B simm26_lsl2:$imm26)>;
+
+let isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in
+def PseudoBRIND : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16), []>,
+ PseudoInstExpansion<(JIRL R0, GPR:$rj, simm16_lsl2:$imm16)>;
+
+def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>;
+def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)),
+ (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>;
+
+let isCall = 1, Defs = [R1] in
+def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []> {
+ let AsmString = "bl\t$func";
+}
+
+def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
+def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
+
+let isCall = 1, Defs = [R1] in
+def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj),
+ [(loongarch_call GPR:$rj)]>,
+ PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>;
+
let isBarrier = 1, isReturn = 1, isTerminator = 1 in
def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
PseudoInstExpansion<(JIRL R0, R1, 0)>;
-/// BSTRPICK
+/// BSTRINS and BSTRPICK
-let Predicates = [IsLA32] in
+let Predicates = [IsLA32] in {
+def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd),
+ (BSTRINS_W GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>;
def : Pat<(loongarch_bstrpick GPR:$rj, uimm5:$msbd, uimm5:$lsbd),
(BSTRPICK_W GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>;
+} // Predicates = [IsLA32]
-let Predicates = [IsLA64] in
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd),
+ (BSTRINS_D GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>;
def : Pat<(loongarch_bstrpick GPR:$rj, uimm6:$msbd, uimm6:$lsbd),
(BSTRPICK_D GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>;
+} // Predicates = [IsLA64]
+
+/// Loads
+
+multiclass LdPat<PatFrag LoadOp, LAInst Inst, ValueType vt = GRLenVT> {
+ def : Pat<(vt (LoadOp BaseAddr:$rj)), (Inst BaseAddr:$rj, 0)>;
+ def : Pat<(vt (LoadOp (add BaseAddr:$rj, simm12:$imm12))),
+ (Inst BaseAddr:$rj, simm12:$imm12)>;
+}
+
+defm : LdPat<sextloadi8, LD_B>;
+defm : LdPat<extloadi8, LD_B>;
+defm : LdPat<sextloadi16, LD_H>;
+defm : LdPat<extloadi16, LD_H>;
+defm : LdPat<load, LD_W>, Requires<[IsLA32]>;
+defm : LdPat<zextloadi8, LD_BU>;
+defm : LdPat<zextloadi16, LD_HU>;
+let Predicates = [IsLA64] in {
+defm : LdPat<sextloadi32, LD_W, i64>;
+defm : LdPat<extloadi32, LD_W, i64>;
+defm : LdPat<zextloadi32, LD_WU, i64>;
+defm : LdPat<load, LD_D, i64>;
+} // Predicates = [IsLA64]
+
+/// Stores
+
+multiclass StPat<PatFrag StoreOp, LAInst Inst, RegisterClass StTy,
+ ValueType vt> {
+ def : Pat<(StoreOp (vt StTy:$rd), BaseAddr:$rj),
+ (Inst StTy:$rd, BaseAddr:$rj, 0)>;
+ def : Pat<(StoreOp (vt StTy:$rd), (add BaseAddr:$rj, simm12:$imm12)),
+ (Inst StTy:$rd, BaseAddr:$rj, simm12:$imm12)>;
+}
+
+defm : StPat<truncstorei8, ST_B, GPR, GRLenVT>;
+defm : StPat<truncstorei16, ST_H, GPR, GRLenVT>;
+defm : StPat<store, ST_W, GPR, i32>, Requires<[IsLA32]>;
+let Predicates = [IsLA64] in {
+defm : StPat<truncstorei32, ST_W, GPR, i64>;
+defm : StPat<store, ST_D, GPR, i64>;
+} // Predicates = [IsLA64]
+
+/// Atomic loads and stores
+
+def : Pat<(atomic_fence timm, timm), (DBAR 0)>;
+
+defm : LdPat<atomic_load_8, LD_B>;
+defm : LdPat<atomic_load_16, LD_H>;
+defm : LdPat<atomic_load_32, LD_W>;
+
+defm : StPat<atomic_store_8, ST_B, GPR, GRLenVT>;
+defm : StPat<atomic_store_16, ST_H, GPR, GRLenVT>;
+defm : StPat<atomic_store_32, ST_W, GPR, i32>, Requires<[IsLA32]>;
+let Predicates = [IsLA64] in {
+defm : LdPat<atomic_load_64, LD_D>;
+defm : StPat<atomic_store_32, ST_W, GPR, i64>;
+defm : StPat<atomic_store_64, ST_D, GPR, i64>;
+} // Predicates = [IsLA64]
+
+/// Other pseudo-instructions
+
+// Pessimistically assume the stack pointer will be clobbered
+let Defs = [R3], Uses = [R3] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+} // Defs = [R3], Uses = [R3]
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
index 7416c93b4d05..488c66f47863 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
@@ -22,6 +22,22 @@
using namespace llvm;
+static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
+ const AsmPrinter &AP) {
+ MCContext &Ctx = AP.OutContext;
+
+ // TODO: Process target flags.
+
+ const MCExpr *ME =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
+ ME = MCBinaryExpr::createAdd(
+ ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+
+ return MCOperand::createExpr(ME);
+}
+
bool llvm::lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO,
MCOperand &MCOp,
const AsmPrinter &AP) {
@@ -41,12 +57,21 @@ bool llvm::lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO,
case MachineOperand::MO_Immediate:
MCOp = MCOperand::createImm(MO.getImm());
break;
- // TODO: lower special operands
- case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = lowerSymbolOperand(MO, AP.GetCPISymbol(MO.getIndex()), AP);
+ break;
case MachineOperand::MO_GlobalAddress:
- case MachineOperand::MO_BlockAddress:
+ MCOp = lowerSymbolOperand(MO, AP.getSymbolPreferLocal(*MO.getGlobal()), AP);
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = lowerSymbolOperand(MO, MO.getMBB()->getSymbol(), AP);
+ break;
case MachineOperand::MO_ExternalSymbol:
- case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = lowerSymbolOperand(
+ MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()), AP);
+ break;
+ // TODO: lower special operands
+ case MachineOperand::MO_BlockAddress:
case MachineOperand::MO_JumpTableIndex:
break;
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
index b9bae8e56304..05902ebb7ba6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp
@@ -110,6 +110,28 @@ void LoongArchRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const {
+ // TODO: this implementation is a temporary placeholder which does just
+ // enough to allow other aspects of code generation to be tested.
+
assert(SPAdj == 0 && "Unexpected non-zero SPAdj value");
- // TODO: Implement this when we have function calls
+
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ DebugLoc DL = MI.getDebugLoc();
+
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+ Register FrameReg;
+ StackOffset Offset =
+ TFI->getFrameIndexReference(MF, FrameIndex, FrameReg) +
+ StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm());
+
+ // Offsets must be encodable in a 12-bit immediate field.
+ if (!isInt<12>(Offset.getFixed())) {
+ report_fatal_error("Frame offsets outside of the signed 12-bit range are "
+ "not currently supported");
+ }
+
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());
}
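For reference, isInt<12> here restricts the folded frame offset to the signed 12-bit immediate range of the load/store forms, i.e. [-2048, 2047]; a tiny standalone equivalent of the check (helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    static bool fitsSImm12(int64_t Offset) {
      return Offset >= -2048 && Offset <= 2047; // same condition as isInt<12>(Offset)
    }

    int main() {
      assert(fitsSImm12(0) && fitsSImm12(-2048) && fitsSImm12(2047));
      assert(!fitsSImm12(2048) && !fitsSImm12(-2049));
      return 0;
    }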
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 3a1a46a9e624..468c4f43cb90 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -102,6 +102,7 @@ public:
return getTM<LoongArchTargetMachine>();
}
+ void addIRPasses() override;
bool addInstSelector() override;
};
} // namespace
@@ -111,6 +112,12 @@ LoongArchTargetMachine::createPassConfig(PassManagerBase &PM) {
return new LoongArchPassConfig(*this, PM);
}
+void LoongArchPassConfig::addIRPasses() {
+ addPass(createAtomicExpandPass());
+
+ TargetPassConfig::addIRPasses();
+}
+
bool LoongArchPassConfig::addInstSelector() {
addPass(createLoongArchISelDag(getLoongArchTargetMachine()));
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
index c733c194e6a2..e50761ab1e27 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/Compiler.h"
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "LoongArchGenInstrInfo.inc"
#define GET_REGINFO_MC_DESC
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
index e576b9a49cd6..a606ccdbc47c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
@@ -46,6 +46,7 @@ createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit);
// Defines symbolic names for LoongArch instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "LoongArchGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
index 3bcce9e3ba3b..4933d40f3388 100644
--- a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
+++ b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
@@ -77,6 +77,9 @@ bool M68kAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
void M68kAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ M68k_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
switch (MI->getOpcode()) {
default: {
if (MI->isPseudo()) {
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
index 2606e22410fc..e6290d4cbec5 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
@@ -31,6 +31,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "M68kGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h
index 0dc601ad876b..2a1cc678016a 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h
@@ -52,6 +52,7 @@ std::unique_ptr<MCObjectTargetWriter> createM68kELFObjectWriter(uint8_t OSABI);
// Defines symbolic names for the M68k instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "M68kGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index 3f006056955d..13a880de68b5 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -22,6 +22,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "MSP430GenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
index 24b0b3298592..e596c3f1ce46 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -53,6 +53,7 @@ createMSP430ELFObjectWriter(uint8_t OSABI);
// Defines symbolic names for the MSP430 instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "MSP430GenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 85c59d5b14b5..9cd2cbe89e46 100644
--- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -149,6 +149,9 @@ bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
//===----------------------------------------------------------------------===//
void MSP430AsmPrinter::emitInstruction(const MachineInstr *MI) {
+ MSP430_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
MSP430MCInstLower MCInstLowering(OutContext, *this);
MCInst TmpInst;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 6fc8fcb482cd..40c807082fdc 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -36,6 +36,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "MipsGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 8531177ee924..d51f3b9abcfd 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -55,6 +55,7 @@ StringRef selectMipsCPU(const Triple &TT, StringRef CPU);
// Defines symbolic names for the Mips instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "MipsGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 9330a791a7cc..fcaf450cc511 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -181,6 +181,10 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI,
}
void MipsAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ // FIXME: Enable feature predicate checks once all the tests pass.
+ // Mips_MC::verifyInstructionPredicates(MI->getOpcode(),
+ // getSubtargetInfo().getFeatureBits());
+
MipsTargetStreamer &TS = getTargetStreamer();
unsigned Opc = MI->getOpcode();
TS.forbidModuleDirective();
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 856d03f0b210..0ba29fb48b05 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -23,6 +23,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "NVPTXGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
index b394566edd0d..78f4e6745502 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -21,6 +21,7 @@
// Defines symbolic names for the PTX instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "NVPTXGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 41e9f375e536..8c92766faecb 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -183,6 +183,7 @@ enum CmpMode {
// Defines symbolic names for the NVPTX instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "NVPTXGenInstrInfo.inc"
#endif
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index b1d842122060..9977d8ba0300 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -139,6 +139,9 @@ VisitGlobalVariableForEmission(const GlobalVariable *GV,
}
void NVPTXAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ NVPTX_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
MCInst Inst;
lowerToMCInst(MI, Inst);
EmitToStreamer(*OutStreamer, Inst);
diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 2201eb19c80f..b4f7a64f144a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -270,10 +270,6 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
// ShuffleVector
return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1],
NewOperands[2]);
- case Instruction::InsertValue:
- // InsertValueConstantExpr
- return Builder.CreateInsertValue(NewOperands[0], NewOperands[1],
- C->getIndices());
case Instruction::GetElementPtr:
// GetElementPtrConstantExpr
return Builder.CreateGEP(cast<GEPOperator>(C)->getSourceElementType(),
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 746f652bfa36..6ad016dfa0a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1861,7 +1861,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag = Ret.getValue(2);
if (ProxyRegTruncates[i]) {
- Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
+ Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].value(), Ret);
}
InVals.push_back(Ret);
diff --git a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index eeedce2d99cb..202134ed7035 100644
--- a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -35,6 +35,8 @@ public:
bool runOnFunction(Function &F) override;
+ StringRef getPassName() const override { return "NVPTX Image Optimizer"; }
+
private:
bool replaceIsTypePSampler(Instruction &I);
bool replaceIsTypePSurface(Instruction &I);
diff --git a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 16fbe1a65562..7929bd2e0df0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -36,6 +36,8 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override { return "NVPTX Prolog Epilog Pass"; }
+
private:
void calculateFrameObjectOffsets(MachineFunction &Fn);
};
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 2d6d72777db2..4e41515b997d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -18,7 +18,6 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
-#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
#include <algorithm>
#include <cstring>
@@ -32,19 +31,27 @@ namespace llvm {
namespace {
typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
-typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
-} // anonymous namespace
-static ManagedStatic<per_module_annot_t> annotationCache;
-static sys::Mutex Lock;
+struct AnnotationCache {
+ sys::Mutex Lock;
+ std::map<const Module *, global_val_annot_t> Cache;
+};
+
+AnnotationCache &getAnnotationCache() {
+ static AnnotationCache AC;
+ return AC;
+}
+} // anonymous namespace
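The shape of this change, reduced to a generic standalone sketch (names here are illustrative, not the NVPTX ones): a function-local static replaces the ManagedStatic, so construction is thread-safe under C++11 magic statics and the mutex lives next to the data it guards.

    #include <map>
    #include <mutex>
    #include <string>

    struct Cache {
      std::mutex Lock;
      std::map<std::string, int> Data;
    };

    static Cache &getCache() {
      static Cache C; // constructed once, on first use, in a thread-safe way
      return C;
    }

    static void put(const std::string &Key, int Value) {
      Cache &C = getCache();
      std::lock_guard<std::mutex> Guard(C.Lock);
      C.Data[Key] = Value;
    }

    int main() {
      put("answer", 42);
      return 0;
    }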
void clearAnnotationCache(const Module *Mod) {
- std::lock_guard<sys::Mutex> Guard(Lock);
- annotationCache->erase(Mod);
+ auto &AC = getAnnotationCache();
+ std::lock_guard<sys::Mutex> Guard(AC.Lock);
+ AC.Cache.erase(Mod);
}
static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
- std::lock_guard<sys::Mutex> Guard(Lock);
+ auto &AC = getAnnotationCache();
+ std::lock_guard<sys::Mutex> Guard(AC.Lock);
assert(md && "Invalid mdnode for annotation");
assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
// start index = 1, to skip the global variable key
@@ -70,7 +77,8 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
}
static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
- std::lock_guard<sys::Mutex> Guard(Lock);
+ auto &AC = getAnnotationCache();
+ std::lock_guard<sys::Mutex> Guard(AC.Lock);
NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations");
if (!NMD)
return;
@@ -93,40 +101,42 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
if (tmp.empty()) // no annotations for this gv
return;
- if ((*annotationCache).find(m) != (*annotationCache).end())
- (*annotationCache)[m][gv] = std::move(tmp);
+ if (AC.Cache.find(m) != AC.Cache.end())
+ AC.Cache[m][gv] = std::move(tmp);
else {
global_val_annot_t tmp1;
tmp1[gv] = std::move(tmp);
- (*annotationCache)[m] = std::move(tmp1);
+ AC.Cache[m] = std::move(tmp1);
}
}
bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
unsigned &retval) {
- std::lock_guard<sys::Mutex> Guard(Lock);
+ auto &AC = getAnnotationCache();
+ std::lock_guard<sys::Mutex> Guard(AC.Lock);
const Module *m = gv->getParent();
- if ((*annotationCache).find(m) == (*annotationCache).end())
+ if (AC.Cache.find(m) == AC.Cache.end())
cacheAnnotationFromMD(m, gv);
- else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+ else if (AC.Cache[m].find(gv) == AC.Cache[m].end())
cacheAnnotationFromMD(m, gv);
- if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+ if (AC.Cache[m][gv].find(prop) == AC.Cache[m][gv].end())
return false;
- retval = (*annotationCache)[m][gv][prop][0];
+ retval = AC.Cache[m][gv][prop][0];
return true;
}
bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
std::vector<unsigned> &retval) {
- std::lock_guard<sys::Mutex> Guard(Lock);
+ auto &AC = getAnnotationCache();
+ std::lock_guard<sys::Mutex> Guard(AC.Lock);
const Module *m = gv->getParent();
- if ((*annotationCache).find(m) == (*annotationCache).end())
+ if (AC.Cache.find(m) == AC.Cache.end())
cacheAnnotationFromMD(m, gv);
- else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+ else if (AC.Cache[m].find(gv) == AC.Cache[m].end())
cacheAnnotationFromMD(m, gv);
- if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+ if (AC.Cache[m][gv].find(prop) == AC.Cache[m][gv].end())
return false;
- retval = (*annotationCache)[m][gv][prop];
+ retval = AC.Cache[m][gv][prop];
return true;
}
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 46bbc44e1681..fa9e69f2e607 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -449,12 +449,9 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
return MO.getImm();
}
-void PPCMCCodeEmitter::encodeInstruction(
- const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
+void PPCMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
// Output the constant in big/little endian byte order.
@@ -492,5 +489,4 @@ bool PPCMCCodeEmitter::isPrefixedInstruction(const MCInst &MI) const {
return InstrInfo->isPrefixed(Opcode);
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "PPCGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
index 39b2f1211f29..c4d4d35a6665 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
@@ -121,12 +121,6 @@ public:
// Is this instruction a prefixed instruction.
bool isPrefixedInstruction(const MCInst &MI) const;
-
-private:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index a651362f703b..1008dc63d064 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -48,6 +48,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "PPCGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index acb860e16518..3ca6f394f60b 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -118,6 +118,7 @@ static inline bool isRunOfOnes64(uint64_t Val, unsigned &MB, unsigned &ME) {
//
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_SCHED_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "PPCGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 22f35c8fa8d3..58a75baf8081 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -230,6 +230,9 @@ private:
void emitGlobalVariableHelper(const GlobalVariable *);
+ // Get the offset of an alias based on its AliaseeObject.
+ uint64_t getAliasOffset(const Constant *C);
+
public:
PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
: PPCAsmPrinter(TM, std::move(Streamer)) {
@@ -656,6 +659,9 @@ static MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO,
/// the current output stream.
///
void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ PPC_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
MCInst TmpInst;
const bool IsPPC64 = Subtarget->isPPC64();
const bool IsAIX = Subtarget->isAIXABI();
@@ -2352,6 +2358,24 @@ static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) {
.Default(false);
}
+uint64_t PPCAIXAsmPrinter::getAliasOffset(const Constant *C) {
+ if (auto *GA = dyn_cast<GlobalAlias>(C))
+ return getAliasOffset(GA->getAliasee());
+ if (auto *CE = dyn_cast<ConstantExpr>(C)) {
+ const MCExpr *LowC = lowerConstant(CE);
+ const MCBinaryExpr *CBE = dyn_cast<MCBinaryExpr>(LowC);
+ if (!CBE)
+ return 0;
+ if (CBE->getOpcode() != MCBinaryExpr::Add)
+ report_fatal_error("Only adding an offset is supported now.");
+ auto *RHS = dyn_cast<MCConstantExpr>(CBE->getRHS());
+ if (!RHS)
+ report_fatal_error("Unable to get the offset of alias.");
+ return RHS->getValue();
+ }
+ return 0;
+}
+
void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
// Special LLVM global arrays have been handled at the initialization.
if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV))
@@ -2422,20 +2446,34 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) {
}
MCSymbol *EmittedInitSym = GVSym;
+
+ // Emit linkage for the global variable and its aliases.
emitLinkage(GV, EmittedInitSym);
+ for (const GlobalAlias *GA : GOAliasMap[GV])
+ emitLinkage(GA, getSymbol(GA));
+
emitAlignment(getGVAlignment(GV, DL), GV);
// When -fdata-sections is enabled, every GlobalVariable will
// be put into its own csect; therefore, label is not necessary here.
- if (!TM.getDataSections() || GV->hasSection()) {
+ if (!TM.getDataSections() || GV->hasSection())
OutStreamer->emitLabel(EmittedInitSym);
+
+ // No alias to emit.
+ if (!GOAliasMap[GV].size()) {
+ emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
+ return;
}
- // Emit aliasing label for global variable.
- for (const GlobalAlias *Alias : GOAliasMap[GV])
- OutStreamer->emitLabel(getSymbol(Alias));
+ // Aliases with the same offset should be aligned. Record the list of aliases
+ // associated with the offset.
+ AliasMapTy AliasList;
+ for (const GlobalAlias *GA : GOAliasMap[GV])
+ AliasList[getAliasOffset(GA->getAliasee())].push_back(GA);
- emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
+ // Emit alias label and element value for global variable.
+ emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer(),
+ &AliasList);
}
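A standalone sketch of the new grouping step, with placeholder types standing in for GlobalAlias and AliasMapTy: aliases are bucketed by their byte offset into the variable so each label can be emitted alongside the element at that offset.

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <string>
    #include <vector>

    struct Alias {
      std::string Name;
      uint64_t Offset; // what getAliasOffset() returns for the aliasee expression
    };

    int main() {
      std::vector<Alias> Aliases = {{"a0", 0}, {"a4", 4}, {"b4", 4}};
      std::map<uint64_t, std::vector<std::string>> AliasList;
      for (const Alias &A : Aliases)
        AliasList[A.Offset].push_back(A.Name);
      assert(AliasList[0].size() == 1 && AliasList[4].size() == 2);
      return 0;
    }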
void PPCAIXAsmPrinter::emitFunctionDescriptor() {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 5b9d1e66b04e..3c461a627d61 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -392,8 +392,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// MASS transformation for LLVM intrinsics with replicating fast-math flag
// to be consistent to PPCGenScalarMASSEntries pass
- if (TM.getOptLevel() == CodeGenOpt::Aggressive &&
- TM.Options.PPCGenScalarMASSEntries) {
+ if (TM.getOptLevel() == CodeGenOpt::Aggressive) {
setOperationAction(ISD::FSIN , MVT::f64, Custom);
setOperationAction(ISD::FCOS , MVT::f64, Custom);
setOperationAction(ISD::FPOW , MVT::f64, Custom);
@@ -17886,13 +17885,17 @@ bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const {
return Op.getNode()->getFlags().hasApproximateFuncs();
}
+bool PPCTargetLowering::isScalarMASSConversionEnabled() const {
+ return getTargetMachine().Options.PPCGenScalarMASSEntries;
+}
+
SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName,
const char *LibCallFloatName,
const char *LibCallDoubleNameFinite,
const char *LibCallFloatNameFinite,
SDValue Op,
SelectionDAG &DAG) const {
- if (!isLowringToMASSSafe(Op))
+ if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op))
return SDValue();
if (!isLowringToMASSFiniteSafe(Op))
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index f92a117fe27f..4a08cc42fa9d 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1293,6 +1293,7 @@ namespace llvm {
SelectionDAG &DAG) const;
bool isLowringToMASSFiniteSafe(SDValue Op) const;
bool isLowringToMASSSafe(SDValue Op) const;
+ bool isScalarMASSConversionEnabled() const;
SDValue lowerLibCallBase(const char *LibCallDoubleName,
const char *LibCallFloatName,
const char *LibCallDoubleNameFinite,
diff --git a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index fbd487fbcfd5..59e8f3ff84a4 100644
--- a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -43,7 +43,6 @@ namespace {
}
const PPCInstrInfo *TII;
- LiveIntervals *LIS;
protected:
bool processBlock(MachineBasicBlock &MBB) {
@@ -83,11 +82,8 @@ protected:
Register InReg = PPC::NoRegister;
Register GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
Register GPR4 = Is64Bit ? PPC::X4 : PPC::R4;
- SmallVector<Register, 3> OrigRegs = {OutReg, GPR3};
- if (!IsPCREL) {
+ if (!IsPCREL)
InReg = MI.getOperand(1).getReg();
- OrigRegs.push_back(InReg);
- }
DebugLoc DL = MI.getDebugLoc();
unsigned Opc1, Opc2;
@@ -139,11 +135,6 @@ protected:
BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
.addImm(0);
- // The ADDItls* instruction is the first instruction in the
- // repair range.
- MachineBasicBlock::iterator First = I;
- --First;
-
if (IsAIX) {
// The variable offset and region handle are copied in r4 and r3. The
// copies are followed by GETtlsADDR32AIX/GETtlsADDR64AIX.
@@ -177,16 +168,10 @@ protected:
BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg)
.addReg(GPR3);
- // The COPY is the last instruction in the repair range.
- MachineBasicBlock::iterator Last = I;
- --Last;
-
// Move past the original instruction and remove it.
++I;
MI.removeFromParent();
- // Repair the live intervals.
- LIS->repairIntervalsInRange(&MBB, First, Last, OrigRegs);
Changed = true;
}
@@ -204,7 +189,6 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override {
TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo();
- LIS = &getAnalysis<LiveIntervals>();
bool Changed = false;
@@ -217,9 +201,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
- AU.addPreserved<LiveIntervals>();
AU.addRequired<SlotIndexes>();
- AU.addPreserved<SlotIndexes>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 7c062387fecd..a335b2d23394 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -84,12 +84,6 @@ public:
unsigned getVMaskReg(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
-
-private:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
@@ -188,9 +182,6 @@ void RISCVMCCodeEmitter::expandAddTPRel(const MCInst &MI, raw_ostream &OS,
void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
// Get byte count of instruction.
unsigned Size = Desc.getSize();
@@ -403,5 +394,4 @@ unsigned RISCVMCCodeEmitter::getVMaskReg(const MCInst &MI, unsigned OpNo,
}
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "RISCVGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 917d93479f18..c63e0c8e737d 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -33,6 +33,7 @@
#include "llvm/Support/ErrorHandling.h"
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "RISCVGenInstrInfo.inc"
#define GET_REGINFO_MC_DESC
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
index 276fc9efb6c0..d157257d976c 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -45,6 +45,7 @@ std::unique_ptr<MCObjectTargetWriter> createRISCVELFObjectWriter(uint8_t OSABI,
// Defines symbolic names for RISC-V instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "RISCVGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 5b2a247ebda0..edd39f6547ed 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -91,6 +91,9 @@ void RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
#include "RISCVGenMCPseudoLowering.inc"
void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ RISCV_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
// Do any auto-generated pseudo lowerings.
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 57d8ba6f0161..a7286b2963c2 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -899,7 +899,8 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
std::pair<int64_t, Align>
-RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const {
+RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFunction &MF) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
// Create a buffer of RVV objects to allocate.
SmallVector<int, 8> ObjectsToAllocate;
for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
@@ -912,10 +913,18 @@ RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const {
ObjectsToAllocate.push_back(I);
}
- // Allocate all RVV locals and spills
- int64_t Offset = 0;
// The minimum alignment is 16 bytes.
Align RVVStackAlign(16);
+ const auto &ST = MF.getSubtarget<RISCVSubtarget>();
+
+ if (!ST.hasVInstructions()) {
+ assert(ObjectsToAllocate.empty() &&
+ "Can't allocate scalable-vector objects without V instructions");
+ return std::make_pair(0, RVVStackAlign);
+ }
+
+ // Allocate all RVV locals and spills
+ int64_t Offset = 0;
for (int FI : ObjectsToAllocate) {
// ObjectSize in bytes.
int64_t ObjectSize = MFI.getObjectSize(FI);
@@ -997,7 +1006,7 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
int64_t RVVStackSize;
Align RVVStackAlign;
- std::tie(RVVStackSize, RVVStackAlign) = assignRVVStackObjectOffsets(MFI);
+ std::tie(RVVStackSize, RVVStackAlign) = assignRVVStackObjectOffsets(MF);
RVFI->setRVVStackSize(RVVStackSize);
RVFI->setRVVStackAlign(RVVStackAlign);
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 466cd059b749..a5cf68a6ea94 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -84,7 +84,7 @@ private:
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
int64_t Amount, MachineInstr::MIFlag Flag) const;
std::pair<int64_t, Align>
- assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const;
+ assignRVVStackObjectOffsets(MachineFunction &MF) const;
};
-}
+} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index cfaafc7b53d2..5b823af1e9b8 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -43,92 +43,95 @@ namespace RISCV {
} // namespace llvm
void RISCVDAGToDAGISel::PreprocessISelDAG() {
- for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
- E = CurDAG->allnodes_end();
- I != E;) {
- SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
-
- // Convert integer SPLAT_VECTOR to VMV_V_X_VL and floating-point
- // SPLAT_VECTOR to VFMV_V_F_VL to reduce isel burden.
- if (N->getOpcode() == ISD::SPLAT_VECTOR) {
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ if (N->use_empty())
+ continue;
+
+ SDValue Result;
+ switch (N->getOpcode()) {
+ case ISD::SPLAT_VECTOR: {
+ // Convert integer SPLAT_VECTOR to VMV_V_X_VL and floating-point
+ // SPLAT_VECTOR to VFMV_V_F_VL to reduce isel burden.
MVT VT = N->getSimpleValueType(0);
unsigned Opc =
VT.isInteger() ? RISCVISD::VMV_V_X_VL : RISCVISD::VFMV_V_F_VL;
SDLoc DL(N);
SDValue VL = CurDAG->getRegister(RISCV::X0, Subtarget->getXLenVT());
- SDValue Result = CurDAG->getNode(Opc, DL, VT, CurDAG->getUNDEF(VT),
- N->getOperand(0), VL);
+ Result = CurDAG->getNode(Opc, DL, VT, CurDAG->getUNDEF(VT),
+ N->getOperand(0), VL);
+ break;
+ }
+ case RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL: {
+ // Lower SPLAT_VECTOR_SPLIT_I64 to two scalar stores and a stride 0 vector
+ // load. Done after lowering and combining so that we have a chance to
+ // optimize this to VMV_V_X_VL when the upper bits aren't needed.
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands");
+ MVT VT = N->getSimpleValueType(0);
+ SDValue Passthru = N->getOperand(0);
+ SDValue Lo = N->getOperand(1);
+ SDValue Hi = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+ assert(VT.getVectorElementType() == MVT::i64 && VT.isScalableVector() &&
+ Lo.getValueType() == MVT::i32 && Hi.getValueType() == MVT::i32 &&
+ "Unexpected VTs!");
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ RISCVMachineFunctionInfo *FuncInfo =
+ MF.getInfo<RISCVMachineFunctionInfo>();
+ SDLoc DL(N);
- --I;
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
- ++I;
- CurDAG->DeleteNode(N);
- continue;
+ // We use the same frame index we use for moving two i32s into 64-bit FPR.
+ // This is an analogous operation.
+ int FI = FuncInfo->getMoveF64FrameIndex(MF);
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
+ const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
+ SDValue StackSlot =
+ CurDAG->getFrameIndex(FI, TLI.getPointerTy(CurDAG->getDataLayout()));
+
+ SDValue Chain = CurDAG->getEntryNode();
+ Lo = CurDAG->getStore(Chain, DL, Lo, StackSlot, MPI, Align(8));
+
+ SDValue OffsetSlot =
+ CurDAG->getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), DL);
+ Hi = CurDAG->getStore(Chain, DL, Hi, OffsetSlot, MPI.getWithOffset(4),
+ Align(8));
+
+ Chain = CurDAG->getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+
+ SDVTList VTs = CurDAG->getVTList({VT, MVT::Other});
+ SDValue IntID =
+ CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64);
+ SDValue Ops[] = {Chain,
+ IntID,
+ Passthru,
+ StackSlot,
+ CurDAG->getRegister(RISCV::X0, MVT::i64),
+ VL};
+
+ Result = CurDAG->getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+ MVT::i64, MPI, Align(8),
+ MachineMemOperand::MOLoad);
+ break;
+ }
}
- // Lower SPLAT_VECTOR_SPLIT_I64 to two scalar stores and a stride 0 vector
- // load. Done after lowering and combining so that we have a chance to
- // optimize this to VMV_V_X_VL when the upper bits aren't needed.
- if (N->getOpcode() != RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL)
- continue;
+ if (Result) {
+ LLVM_DEBUG(dbgs() << "RISCV DAG preprocessing replacing:\nOld: ");
+ LLVM_DEBUG(N->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\nNew: ");
+ LLVM_DEBUG(Result->dump(CurDAG));
+ LLVM_DEBUG(dbgs() << "\n");
- assert(N->getNumOperands() == 4 && "Unexpected number of operands");
- MVT VT = N->getSimpleValueType(0);
- SDValue Passthru = N->getOperand(0);
- SDValue Lo = N->getOperand(1);
- SDValue Hi = N->getOperand(2);
- SDValue VL = N->getOperand(3);
- assert(VT.getVectorElementType() == MVT::i64 && VT.isScalableVector() &&
- Lo.getValueType() == MVT::i32 && Hi.getValueType() == MVT::i32 &&
- "Unexpected VTs!");
- MachineFunction &MF = CurDAG->getMachineFunction();
- RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>();
- SDLoc DL(N);
-
- // We use the same frame index we use for moving two i32s into 64-bit FPR.
- // This is an analogous operation.
- int FI = FuncInfo->getMoveF64FrameIndex(MF);
- MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
- const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
- SDValue StackSlot =
- CurDAG->getFrameIndex(FI, TLI.getPointerTy(CurDAG->getDataLayout()));
-
- SDValue Chain = CurDAG->getEntryNode();
- Lo = CurDAG->getStore(Chain, DL, Lo, StackSlot, MPI, Align(8));
-
- SDValue OffsetSlot =
- CurDAG->getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), DL);
- Hi = CurDAG->getStore(Chain, DL, Hi, OffsetSlot, MPI.getWithOffset(4),
- Align(8));
-
- Chain = CurDAG->getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
-
- SDVTList VTs = CurDAG->getVTList({VT, MVT::Other});
- SDValue IntID =
- CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64);
- SDValue Ops[] = {Chain,
- IntID,
- Passthru,
- StackSlot,
- CurDAG->getRegister(RISCV::X0, MVT::i64),
- VL};
-
- SDValue Result = CurDAG->getMemIntrinsicNode(
- ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MVT::i64, MPI, Align(8),
- MachineMemOperand::MOLoad);
-
- // We're about to replace all uses of the SPLAT_VECTOR_SPLIT_I64 with the
- // vlse we created. This will cause general havok on the dag because
- // anything below the conversion could be folded into other existing nodes.
- // To avoid invalidating 'I', back it up to the convert node.
- --I;
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
-
- // Now that we did that, the node is dead. Increment the iterator to the
- // next node to process, then delete N.
- ++I;
- CurDAG->DeleteNode(N);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+ MadeChange = true;
+ }
}
+
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
}
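(A standalone illustration, not part of the patch: the SPLAT_VECTOR_SPLIT_I64_VL case above relies on the two i32 stores landing in the low and high halves of one 8-byte stack slot, which the stride-0 vlse then reads back as a single i64 element. A minimal host-side C++ sketch of that layout, assuming a little-endian target as on RISC-V; all names below are illustrative.)

#include <cassert>
#include <cstdint>
#include <cstring>

// Write Lo at offset 0 and Hi at offset 4 of an 8-byte slot, then reload the
// slot as one 64-bit value -- the element the stride-0 vector load splats.
static uint64_t splatElementFromHalves(uint32_t Lo, uint32_t Hi) {
  alignas(8) unsigned char Slot[8];
  std::memcpy(Slot + 0, &Lo, 4); // first scalar store
  std::memcpy(Slot + 4, &Hi, 4); // second scalar store at offset 4
  uint64_t Elt;
  std::memcpy(&Elt, Slot, 8);    // what the stride-0 vlse replicates
  return Elt;
}

int main() {
  assert(splatElementFromHalves(0x89abcdefu, 0x01234567u) ==
         0x0123456789abcdefull);
  return 0;
}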
void RISCVDAGToDAGISel::PostprocessISelDAG() {
@@ -143,7 +146,6 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() {
continue;
MadeChange |= doPeepholeSExtW(N);
- MadeChange |= doPeepholeLoadStoreADDI(N);
MadeChange |= doPeepholeMaskedRVV(N);
}
@@ -153,40 +155,6 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() {
CurDAG->RemoveDeadNodes();
}
-// Returns true if N is a MachineSDNode that has a reg and simm12 memory
-// operand. The indices of the base pointer and offset are returned in BaseOpIdx
-// and OffsetOpIdx.
-static bool hasMemOffset(SDNode *N, unsigned &BaseOpIdx,
- unsigned &OffsetOpIdx) {
- switch (N->getMachineOpcode()) {
- case RISCV::LB:
- case RISCV::LH:
- case RISCV::LW:
- case RISCV::LBU:
- case RISCV::LHU:
- case RISCV::LWU:
- case RISCV::LD:
- case RISCV::FLH:
- case RISCV::FLW:
- case RISCV::FLD:
- BaseOpIdx = 0;
- OffsetOpIdx = 1;
- return true;
- case RISCV::SB:
- case RISCV::SH:
- case RISCV::SW:
- case RISCV::SD:
- case RISCV::FSH:
- case RISCV::FSW:
- case RISCV::FSD:
- BaseOpIdx = 1;
- OffsetOpIdx = 2;
- return true;
- }
-
- return false;
-}
-
static SDNode *selectImmSeq(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT,
RISCVMatInt::InstSeq &Seq) {
SDNode *Result = nullptr;
@@ -285,9 +253,7 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands(
SDValue Chain = Node->getOperand(0);
SDValue Glue;
- SDValue Base;
- SelectBaseAddr(Node->getOperand(CurOp++), Base);
- Operands.push_back(Base); // Base pointer.
+ Operands.push_back(Node->getOperand(CurOp++)); // Base pointer.
if (IsStridedOrIndexed) {
Operands.push_back(Node->getOperand(CurOp++)); // Index.
@@ -651,83 +617,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget));
return;
}
- case ISD::ADD: {
- // Try to select ADD + immediate used as memory addresses to
- // (ADDI (ADD X, Imm-Lo12), Lo12) if it will allow the ADDI to be removed by
- // doPeepholeLoadStoreADDI.
-
- // LHS should be an immediate.
- auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
- if (!N1C)
- break;
-
- int64_t Offset = N1C->getSExtValue();
- int64_t Lo12 = SignExtend64<12>(Offset);
-
- // Don't do this if the lower 12 bits are 0 or we could use ADDI directly.
- if (Lo12 == 0 || isInt<12>(Offset))
- break;
-
- // Don't do this if we can use a pair of ADDIs.
- if (isInt<12>(Offset / 2) && isInt<12>(Offset - Offset / 2))
- break;
-
- RISCVMatInt::InstSeq Seq =
- RISCVMatInt::generateInstSeq(Offset, Subtarget->getFeatureBits());
-
- Offset -= Lo12;
- // Restore sign bits for RV32.
- if (!Subtarget->is64Bit())
- Offset = SignExtend64<32>(Offset);
-
- // We can fold if the last operation is an ADDI or its an ADDIW that could
- // be treated as an ADDI.
- if (Seq.back().Opc != RISCV::ADDI &&
- !(Seq.back().Opc == RISCV::ADDIW && isInt<32>(Offset)))
- break;
- assert(Seq.back().Imm == Lo12 && "Expected immediate to match Lo12");
- // Drop the last operation.
- Seq.pop_back();
- assert(!Seq.empty() && "Expected more instructions in sequence");
-
- bool AllPointerUses = true;
- for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) {
- SDNode *User = *UI;
-
- // Is this user a memory instruction that uses a register and immediate
- // that has this ADD as its pointer.
- unsigned BaseOpIdx, OffsetOpIdx;
- if (!User->isMachineOpcode() ||
- !hasMemOffset(User, BaseOpIdx, OffsetOpIdx) ||
- UI.getOperandNo() != BaseOpIdx) {
- AllPointerUses = false;
- break;
- }
-
- // If the memory instruction already has an offset, make sure the combined
- // offset is foldable.
- int64_t MemOffs =
- cast<ConstantSDNode>(User->getOperand(OffsetOpIdx))->getSExtValue();
- MemOffs += Lo12;
- if (!isInt<12>(MemOffs)) {
- AllPointerUses = false;
- break;
- }
- }
-
- if (!AllPointerUses)
- break;
-
- // Emit (ADDI (ADD X, Hi), Lo)
- SDNode *Imm = selectImmSeq(CurDAG, DL, VT, Seq);
- SDNode *ADD = CurDAG->getMachineNode(RISCV::ADD, DL, VT,
- Node->getOperand(0), SDValue(Imm, 0));
- SDNode *ADDI =
- CurDAG->getMachineNode(RISCV::ADDI, DL, VT, SDValue(ADD, 0),
- CurDAG->getTargetConstant(Lo12, DL, VT));
- ReplaceNode(Node, ADDI);
- return;
- }
case ISD::SHL: {
auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
if (!N1C)
@@ -856,10 +745,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C)
break;
- uint64_t C2 = C->getZExtValue();
+ unsigned C2 = C->getZExtValue();
unsigned XLen = Subtarget->getXLen();
- if (!C2 || C2 >= XLen)
- break;
+ assert((C2 > 0 && C2 < XLen) && "Unexpected shift amount!");
uint64_t C1 = N1C->getZExtValue();
@@ -885,10 +773,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// Turn (and (srl x, c2) c1) -> (srli (slli x, c3-c2), c3) if c1 is a mask
// with c3 leading zeros.
if (!LeftShift && isMask_64(C1)) {
- uint64_t C3 = XLen - (64 - countLeadingZeros(C1));
- if (C2 < C3) {
+ unsigned Leading = XLen - (64 - countLeadingZeros(C1));
+ if (C2 < Leading) {
// If the number of leading zeros is C2+32 this can be SRLIW.
- if (C2 + 32 == C3) {
+ if (C2 + 32 == Leading) {
SDNode *SRLIW = CurDAG->getMachineNode(
RISCV::SRLIW, DL, VT, X, CurDAG->getTargetConstant(C2, DL, VT));
ReplaceNode(Node, SRLIW);
@@ -900,7 +788,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
//
// This pattern occurs when (i32 (srl (sra 31), c3 - 32)) is type
// legalized and goes through DAG combine.
- if (C2 >= 32 && (C3 - C2) == 1 && N0.hasOneUse() &&
+ if (C2 >= 32 && (Leading - C2) == 1 && N0.hasOneUse() &&
X.getOpcode() == ISD::SIGN_EXTEND_INREG &&
cast<VTSDNode>(X.getOperand(1))->getVT() == MVT::i32) {
SDNode *SRAIW =
@@ -908,25 +796,25 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
CurDAG->getTargetConstant(31, DL, VT));
SDNode *SRLIW = CurDAG->getMachineNode(
RISCV::SRLIW, DL, VT, SDValue(SRAIW, 0),
- CurDAG->getTargetConstant(C3 - 32, DL, VT));
+ CurDAG->getTargetConstant(Leading - 32, DL, VT));
ReplaceNode(Node, SRLIW);
return;
}
// (srli (slli x, c3-c2), c3).
// Skip if we could use (zext.w (sraiw X, C2)).
- bool Skip = Subtarget->hasStdExtZba() && C3 == 32 &&
+ bool Skip = Subtarget->hasStdExtZba() && Leading == 32 &&
X.getOpcode() == ISD::SIGN_EXTEND_INREG &&
cast<VTSDNode>(X.getOperand(1))->getVT() == MVT::i32;
// Also Skip if we can use bexti.
- Skip |= Subtarget->hasStdExtZbs() && C3 == XLen - 1;
+ Skip |= Subtarget->hasStdExtZbs() && Leading == XLen - 1;
if (OneUseOrZExtW && !Skip) {
SDNode *SLLI = CurDAG->getMachineNode(
RISCV::SLLI, DL, VT, X,
- CurDAG->getTargetConstant(C3 - C2, DL, VT));
- SDNode *SRLI =
- CurDAG->getMachineNode(RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
- CurDAG->getTargetConstant(C3, DL, VT));
+ CurDAG->getTargetConstant(Leading - C2, DL, VT));
+ SDNode *SRLI = CurDAG->getMachineNode(
+ RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
+ CurDAG->getTargetConstant(Leading, DL, VT));
ReplaceNode(Node, SRLI);
return;
}
@@ -936,12 +824,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// Turn (and (shl x, c2), c1) -> (srli (slli c2+c3), c3) if c1 is a mask
// shifted by c2 bits with c3 leading zeros.
if (LeftShift && isShiftedMask_64(C1)) {
- uint64_t C3 = XLen - (64 - countLeadingZeros(C1));
+ unsigned Leading = XLen - (64 - countLeadingZeros(C1));
- if (C2 + C3 < XLen &&
- C1 == (maskTrailingOnes<uint64_t>(XLen - (C2 + C3)) << C2)) {
+ if (C2 + Leading < XLen &&
+ C1 == (maskTrailingOnes<uint64_t>(XLen - (C2 + Leading)) << C2)) {
// Use slli.uw when possible.
- if ((XLen - (C2 + C3)) == 32 && Subtarget->hasStdExtZba()) {
+ if ((XLen - (C2 + Leading)) == 32 && Subtarget->hasStdExtZba()) {
SDNode *SLLI_UW = CurDAG->getMachineNode(
RISCV::SLLI_UW, DL, VT, X, CurDAG->getTargetConstant(C2, DL, VT));
ReplaceNode(Node, SLLI_UW);
@@ -952,10 +840,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (OneUseOrZExtW && !IsCANDI) {
SDNode *SLLI = CurDAG->getMachineNode(
RISCV::SLLI, DL, VT, X,
- CurDAG->getTargetConstant(C2 + C3, DL, VT));
- SDNode *SRLI =
- CurDAG->getMachineNode(RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
- CurDAG->getTargetConstant(C3, DL, VT));
+ CurDAG->getTargetConstant(C2 + Leading, DL, VT));
+ SDNode *SRLI = CurDAG->getMachineNode(
+ RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
+ CurDAG->getTargetConstant(Leading, DL, VT));
ReplaceNode(Node, SRLI);
return;
}
@@ -965,9 +853,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// Turn (and (shr x, c2), c1) -> (slli (srli x, c2+c3), c3) if c1 is a
// shifted mask with c2 leading zeros and c3 trailing zeros.
if (!LeftShift && isShiftedMask_64(C1)) {
- uint64_t Leading = XLen - (64 - countLeadingZeros(C1));
- uint64_t C3 = countTrailingZeros(C1);
- if (Leading == C2 && C2 + C3 < XLen && OneUseOrZExtW && !IsCANDI) {
+ unsigned Leading = XLen - (64 - countLeadingZeros(C1));
+ unsigned Trailing = countTrailingZeros(C1);
+ if (Leading == C2 && C2 + Trailing < XLen && OneUseOrZExtW && !IsCANDI) {
unsigned SrliOpc = RISCV::SRLI;
// If the input is zexti32 we should use SRLIW.
if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
@@ -976,22 +864,23 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
X = X.getOperand(0);
}
SDNode *SRLI = CurDAG->getMachineNode(
- SrliOpc, DL, VT, X, CurDAG->getTargetConstant(C2 + C3, DL, VT));
+ SrliOpc, DL, VT, X,
+ CurDAG->getTargetConstant(C2 + Trailing, DL, VT));
SDNode *SLLI =
CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLI, 0),
- CurDAG->getTargetConstant(C3, DL, VT));
+ CurDAG->getTargetConstant(Trailing, DL, VT));
ReplaceNode(Node, SLLI);
return;
}
// If the leading zero count is C2+32, we can use SRLIW instead of SRLI.
- if (Leading > 32 && (Leading - 32) == C2 && C2 + C3 < 32 &&
+ if (Leading > 32 && (Leading - 32) == C2 && C2 + Trailing < 32 &&
OneUseOrZExtW && !IsCANDI) {
- SDNode *SRLIW =
- CurDAG->getMachineNode(RISCV::SRLIW, DL, VT, X,
- CurDAG->getTargetConstant(C2 + C3, DL, VT));
+ SDNode *SRLIW = CurDAG->getMachineNode(
+ RISCV::SRLIW, DL, VT, X,
+ CurDAG->getTargetConstant(C2 + Trailing, DL, VT));
SDNode *SLLI =
CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLIW, 0),
- CurDAG->getTargetConstant(C3, DL, VT));
+ CurDAG->getTargetConstant(Trailing, DL, VT));
ReplaceNode(Node, SLLI);
return;
}
@@ -1000,25 +889,26 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// Turn (and (shl x, c2), c1) -> (slli (srli x, c3-c2), c3) if c1 is a
// shifted mask with no leading zeros and c3 trailing zeros.
if (LeftShift && isShiftedMask_64(C1)) {
- uint64_t Leading = XLen - (64 - countLeadingZeros(C1));
- uint64_t C3 = countTrailingZeros(C1);
- if (Leading == 0 && C2 < C3 && OneUseOrZExtW && !IsCANDI) {
+ unsigned Leading = XLen - (64 - countLeadingZeros(C1));
+ unsigned Trailing = countTrailingZeros(C1);
+ if (Leading == 0 && C2 < Trailing && OneUseOrZExtW && !IsCANDI) {
SDNode *SRLI = CurDAG->getMachineNode(
- RISCV::SRLI, DL, VT, X, CurDAG->getTargetConstant(C3 - C2, DL, VT));
+ RISCV::SRLI, DL, VT, X,
+ CurDAG->getTargetConstant(Trailing - C2, DL, VT));
SDNode *SLLI =
CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLI, 0),
- CurDAG->getTargetConstant(C3, DL, VT));
+ CurDAG->getTargetConstant(Trailing, DL, VT));
ReplaceNode(Node, SLLI);
return;
}
// If we have (32-C2) leading zeros, we can use SRLIW instead of SRLI.
- if (C2 < C3 && Leading + C2 == 32 && OneUseOrZExtW && !IsCANDI) {
- SDNode *SRLIW =
- CurDAG->getMachineNode(RISCV::SRLIW, DL, VT, X,
- CurDAG->getTargetConstant(C3 - C2, DL, VT));
+ if (C2 < Trailing && Leading + C2 == 32 && OneUseOrZExtW && !IsCANDI) {
+ SDNode *SRLIW = CurDAG->getMachineNode(
+ RISCV::SRLIW, DL, VT, X,
+ CurDAG->getTargetConstant(Trailing - C2, DL, VT));
SDNode *SLLI =
CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLIW, 0),
- CurDAG->getTargetConstant(C3, DL, VT));
+ CurDAG->getTargetConstant(Trailing, DL, VT));
ReplaceNode(Node, SLLI);
return;
}
@@ -1885,13 +1775,74 @@ bool RISCVDAGToDAGISel::SelectFrameAddrRegImm(SDValue Addr, SDValue &Base,
return false;
}
-bool RISCVDAGToDAGISel::SelectBaseAddr(SDValue Addr, SDValue &Base) {
- // If this is FrameIndex, select it directly. Otherwise just let it get
- // selected to a register independently.
- if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr))
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT());
- else
- Base = Addr;
+// Fold constant addresses.
+static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL,
+ const MVT VT, const RISCVSubtarget *Subtarget,
+ SDValue Addr, SDValue &Base, SDValue &Offset) {
+ if (!isa<ConstantSDNode>(Addr))
+ return false;
+
+ int64_t CVal = cast<ConstantSDNode>(Addr)->getSExtValue();
+
+ // If the constant is a simm12, we can fold the whole constant and use X0 as
+ // the base. If the constant can be materialized with LUI+simm12, use LUI as
+ // the base. We can't use generateInstSeq because it favors LUI+ADDIW.
+ int64_t Lo12 = SignExtend64<12>(CVal);
+ int64_t Hi = (uint64_t)CVal - (uint64_t)Lo12;
+ if (!Subtarget->is64Bit() || isInt<32>(Hi)) {
+ if (Hi) {
+ int64_t Hi20 = (Hi >> 12) & 0xfffff;
+ Base = SDValue(
+ CurDAG->getMachineNode(RISCV::LUI, DL, VT,
+ CurDAG->getTargetConstant(Hi20, DL, VT)),
+ 0);
+ } else {
+ Base = CurDAG->getRegister(RISCV::X0, VT);
+ }
+ Offset = CurDAG->getTargetConstant(Lo12, DL, VT);
+ return true;
+ }
+
+ // Ask how constant materialization would handle this constant.
+ RISCVMatInt::InstSeq Seq =
+ RISCVMatInt::generateInstSeq(CVal, Subtarget->getFeatureBits());
+
+ // If the last instruction would be an ADDI, we can fold its immediate and
+ // emit the rest of the sequence as the base.
+ if (Seq.back().Opc != RISCV::ADDI)
+ return false;
+ Lo12 = Seq.back().Imm;
+
+ // Drop the last instruction.
+ Seq.pop_back();
+ assert(!Seq.empty() && "Expected more instructions in sequence");
+
+ Base = SDValue(selectImmSeq(CurDAG, DL, VT, Seq), 0);
+ Offset = CurDAG->getTargetConstant(Lo12, DL, VT);
+ return true;
+}
+
+// Is this ADD instruction only used as the base pointer of scalar loads and
+// stores?
+static bool isWorthFoldingAdd(SDValue Add) {
+ for (auto Use : Add->uses()) {
+ if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
+ Use->getOpcode() != ISD::ATOMIC_LOAD &&
+ Use->getOpcode() != ISD::ATOMIC_STORE)
+ return false;
+ EVT VT = cast<MemSDNode>(Use)->getMemoryVT();
+ if (!VT.isScalarInteger() && VT != MVT::f16 && VT != MVT::f32 &&
+ VT != MVT::f64)
+ return false;
+ // Don't allow stores of the value. It must be used as the address.
+ if (Use->getOpcode() == ISD::STORE &&
+ cast<StoreSDNode>(Use)->getValue() == Add)
+ return false;
+ if (Use->getOpcode() == ISD::ATOMIC_STORE &&
+ cast<AtomicSDNode>(Use)->getVal() == Add)
+ return false;
+ }
+
return true;
}
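(Illustration only, not LLVM code: the Hi/Lo12 split in selectConstantAddr above takes the low 12 bits as a sign-extended simm12 and leaves a 4 KiB-aligned remainder for LUI. A small self-contained C++ sketch of that arithmetic; the helper name is made up.)

#include <cassert>
#include <cstdint>

// Split a constant address into a 4 KiB-aligned Hi part (for LUI) plus a
// sign-extended simm12 Lo part, mirroring the Hi/Lo12 computation above.
static void splitConstAddr(int64_t CVal, int64_t &Hi, int64_t &Lo12) {
  Lo12 = CVal & 0xfff;
  if (Lo12 >= 2048)
    Lo12 -= 4096;                                    // SignExtend64<12>
  Hi = (int64_t)((uint64_t)CVal - (uint64_t)Lo12);   // low 12 bits are zero
}

int main() {
  int64_t Hi, Lo12;
  splitConstAddr(0x12345800, Hi, Lo12);
  // Lo12 is negative, so Hi rounds up to the next 4 KiB boundary (LUI 0x12346).
  assert(Lo12 == -2048 && Hi == 0x12346000 && Hi + Lo12 == 0x12345800);
  splitConstAddr(0x12345678, Hi, Lo12);
  assert(Lo12 == 0x678 && Hi == 0x12345000);
  return 0;
}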
@@ -1947,9 +1898,10 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
assert(!isInt<12>(CVal) && "simm12 not already handled?");
+ // Handle immediates in the range [-4096,-2049] or [2048, 4094]. We can use
+ // an ADDI for part of the offset and fold the rest into the load/store.
+ // This mirrors the AddiPair PatFrag in RISCVInstrInfo.td.
if (isInt<12>(CVal / 2) && isInt<12>(CVal - CVal / 2)) {
- // We can use an ADDI for part of the offset and fold the rest into the
- // load/store. This mirrors the AddiPair PatFrag in RISCVInstrInfo.td.
int64_t Adj = CVal < 0 ? -2048 : 2047;
Base = SDValue(
CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0),
@@ -1958,8 +1910,27 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
Offset = CurDAG->getTargetConstant(CVal - Adj, DL, VT);
return true;
}
+
+ // For larger immediates, we might be able to save one instruction from
+ // constant materialization by folding the Lo12 bits of the immediate into
+ // the address. We should only do this if the ADD is only used by loads and
+ // stores that can fold the lo12 bits. Otherwise, the ADD will get iseled
+ // separately with the full materialized immediate creating extra
+ // instructions.
+ if (isWorthFoldingAdd(Addr) &&
+ selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr.getOperand(1), Base,
+ Offset)) {
+ // Insert an ADD instruction with the materialized Hi52 bits.
+ Base = SDValue(
+ CurDAG->getMachineNode(RISCV::ADD, DL, VT, Addr.getOperand(0), Base),
+ 0);
+ return true;
+ }
}
+ if (selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr, Base, Offset))
+ return true;
+
Base = Addr;
Offset = CurDAG->getTargetConstant(0, DL, VT);
return true;
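(A standalone sketch of the AddiPair split used just above, not part of the patch: offsets in [-4096,-2049] or [2048,4094] are peeled into an ADDI by +2047 or -2048 plus a simm12 that folds into the memory instruction. Names are illustrative.)

#include <cassert>
#include <cstdint>

static bool isInt12(int64_t V) { return V >= -2048 && V <= 2047; }

// Mirror of the AddiPair split: peel off the adjustment with an ADDI and
// fold the remainder into the load/store offset.
static void splitAddiPair(int64_t CVal, int64_t &Adj, int64_t &MemOff) {
  assert(!isInt12(CVal) && isInt12(CVal / 2) && isInt12(CVal - CVal / 2));
  Adj = CVal < 0 ? -2048 : 2047;
  MemOff = CVal - Adj;
}

int main() {
  int64_t Adj, Off;
  splitAddiPair(3000, Adj, Off);   // e.g. addi t0, rs1, 2047 ; lw rd, 953(t0)
  assert(Adj == 2047 && Off == 953 && isInt12(Off));
  splitAddiPair(-4096, Adj, Off);  // e.g. addi t0, rs1, -2048 ; lw rd, -2048(t0)
  assert(Adj == -2048 && Off == -2048 && isInt12(Off));
  return 0;
}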
@@ -2044,6 +2015,101 @@ bool RISCVDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) {
return false;
}
+/// Look for various patterns that can be done with a SHL that can be folded
+/// into a SHXADD. \p ShAmt contains 1, 2, or 3 and is set based on which
+/// SHXADD we are trying to match.
+bool RISCVDAGToDAGISel::selectSHXADDOp(SDValue N, unsigned ShAmt,
+ SDValue &Val) {
+ if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
+ SDValue N0 = N.getOperand(0);
+
+ bool LeftShift = N0.getOpcode() == ISD::SHL;
+ if ((LeftShift || N0.getOpcode() == ISD::SRL) &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ uint64_t Mask = N.getConstantOperandVal(1);
+ unsigned C2 = N0.getConstantOperandVal(1);
+
+ unsigned XLen = Subtarget->getXLen();
+ if (LeftShift)
+ Mask &= maskTrailingZeros<uint64_t>(C2);
+ else
+ Mask &= maskTrailingOnes<uint64_t>(XLen - C2);
+
+ // Look for (and (shl y, c2), c1) where c1 is a shifted mask with no
+ // leading zeros and c3 trailing zeros. We can use an SRLI by c3-c2
+ // followed by a SHXADD with c3 for the X amount.
+ if (isShiftedMask_64(Mask)) {
+ unsigned Leading = XLen - (64 - countLeadingZeros(Mask));
+ unsigned Trailing = countTrailingZeros(Mask);
+ if (LeftShift && Leading == 0 && C2 < Trailing && Trailing == ShAmt) {
+ SDLoc DL(N);
+ EVT VT = N.getValueType();
+ Val = SDValue(CurDAG->getMachineNode(
+ RISCV::SRLI, DL, VT, N0.getOperand(0),
+ CurDAG->getTargetConstant(Trailing - C2, DL, VT)),
+ 0);
+ return true;
+ }
+ // Look for (and (shr y, c2), c1) where c1 is a shifted mask with c2
+ // leading zeros and c3 trailing zeros. We can use an SRLI by c2+c3
+ // followed by a SHXADD using c3 for the X amount.
+ if (!LeftShift && Leading == C2 && Trailing == ShAmt) {
+ SDLoc DL(N);
+ EVT VT = N.getValueType();
+ Val = SDValue(
+ CurDAG->getMachineNode(
+ RISCV::SRLI, DL, VT, N0.getOperand(0),
+ CurDAG->getTargetConstant(Leading + Trailing, DL, VT)),
+ 0);
+ return true;
+ }
+ }
+ }
+ }
+
+ bool LeftShift = N.getOpcode() == ISD::SHL;
+ if ((LeftShift || N.getOpcode() == ISD::SRL) &&
+ isa<ConstantSDNode>(N.getOperand(1))) {
+ SDValue N0 = N.getOperand(0);
+ if (N0.getOpcode() == ISD::AND && N0.hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ uint64_t Mask = N0.getConstantOperandVal(1);
+ if (isShiftedMask_64(Mask)) {
+ unsigned C1 = N.getConstantOperandVal(1);
+ unsigned XLen = Subtarget->getXLen();
+ unsigned Leading = XLen - (64 - countLeadingZeros(Mask));
+ unsigned Trailing = countTrailingZeros(Mask);
+ // Look for (shl (and X, Mask), C1) where Mask has 32 leading zeros and
+ // C3 trailing zeros. If C1+C3==ShAmt we can use SRLIW+SHXADD.
+ if (LeftShift && Leading == 32 && Trailing > 0 &&
+ (Trailing + C1) == ShAmt) {
+ SDLoc DL(N);
+ EVT VT = N.getValueType();
+ Val = SDValue(CurDAG->getMachineNode(
+ RISCV::SRLIW, DL, VT, N0.getOperand(0),
+ CurDAG->getTargetConstant(Trailing, DL, VT)),
+ 0);
+ return true;
+ }
+ // Look for (srl (and X, Mask), C1) where Mask has 32 leading zeros and
+ // C3 trailing zeros. If C3-C1==ShAmt we can use SRLIW+SHXADD.
+ if (!LeftShift && Leading == 32 && Trailing > C1 &&
+ (Trailing - C1) == ShAmt) {
+ SDLoc DL(N);
+ EVT VT = N.getValueType();
+ Val = SDValue(CurDAG->getMachineNode(
+ RISCV::SRLIW, DL, VT, N0.getOperand(0),
+ CurDAG->getTargetConstant(Trailing, DL, VT)),
+ 0);
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
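(Illustration of the second selectSHXADDOp fold above, for ShAmt == 3, not LLVM code: when the mask has c2 leading zeros and ShAmt trailing zeros, the AND/SRL pair collapses into a single SRLI by c2+ShAmt feeding SH3ADD. The concrete mask and shift amounts are chosen for the example.)

#include <cassert>
#include <cstdint>

// (add (and (srl X, 8), 0x00FFFFFFFFFFFFF8), Y): the mask has 8 leading
// zeros (== c2) and 3 trailing zeros (== ShAmt), so the AND/SRL pair can be
// replaced by SRLI X, 11 feeding SH3ADD.
static uint64_t beforeFold(uint64_t X, uint64_t Y) {
  return ((X >> 8) & 0x00FFFFFFFFFFFFF8ull) + Y;
}
static uint64_t afterFold(uint64_t X, uint64_t Y) {
  uint64_t Srli = X >> 11;  // SRLI x, 8 + 3
  return (Srli << 3) + Y;   // SH3ADD srli, y
}

int main() {
  for (uint64_t X : {0x0123456789abcdefull, ~0ull, 0x8000000000000001ull})
    for (uint64_t Y : {0ull, 0xdeadbeef00ull})
      assert(beforeFold(X, Y) == afterFold(X, Y));
  return 0;
}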
// Return true if all users of this SDNode* only consume the lower \p Bits.
// This can be used to form W instructions for add/sub/mul/shl even when the
// root isn't a sext_inreg. This can allow the ADDW/SUBW/MULW/SLLIW to CSE if
@@ -2271,102 +2337,6 @@ bool RISCVDAGToDAGISel::selectRVVSimm5(SDValue N, unsigned Width,
return false;
}
-// Merge an ADDI into the offset of a load/store instruction where possible.
-// (load (addi base, off1), off2) -> (load base, off1+off2)
-// (store val, (addi base, off1), off2) -> (store val, base, off1+off2)
-// (load (add base, (addi src, off1)), off2)
-// -> (load (add base, src), off1+off2)
-// (store val, (add base, (addi src, off1)), off2)
-// -> (store val, (add base, src), off1+off2)
-// This is possible when off1+off2 fits a 12-bit immediate.
-bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) {
- unsigned OffsetOpIdx, BaseOpIdx;
- if (!hasMemOffset(N, BaseOpIdx, OffsetOpIdx))
- return false;
-
- if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
- return false;
-
- SDValue Base = N->getOperand(BaseOpIdx);
-
- if (!Base.isMachineOpcode())
- return false;
-
- if (Base.getMachineOpcode() == RISCV::ADDI) {
- // If the base is an ADDI, we can merge it in to the load/store.
- } else if (Base.getMachineOpcode() == RISCV::ADDIW &&
- isa<ConstantSDNode>(Base.getOperand(1)) &&
- Base.getOperand(0).isMachineOpcode() &&
- Base.getOperand(0).getMachineOpcode() == RISCV::LUI &&
- isa<ConstantSDNode>(Base.getOperand(0).getOperand(0))) {
- // ADDIW can be merged if it's part of LUI+ADDIW constant materialization
- // and LUI+ADDI would have produced the same result. This is true for all
- // simm32 values except 0x7ffff800-0x7fffffff.
- int64_t Offset =
- SignExtend64<32>(Base.getOperand(0).getConstantOperandVal(0) << 12);
- Offset += cast<ConstantSDNode>(Base.getOperand(1))->getSExtValue();
- if (!isInt<32>(Offset))
- return false;
- } else
- return false;
-
- SDValue ImmOperand = Base.getOperand(1);
- uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx);
-
- if (auto *Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
- int64_t Offset1 = Const->getSExtValue();
- int64_t CombinedOffset = Offset1 + Offset2;
- if (!isInt<12>(CombinedOffset))
- return false;
- ImmOperand = CurDAG->getTargetConstant(CombinedOffset, SDLoc(ImmOperand),
- ImmOperand.getValueType());
- } else if (auto *GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
- // If the off1 in (addi base, off1) is a global variable's address (its
- // low part, really), then we can rely on the alignment of that variable
- // to provide a margin of safety before off1 can overflow the 12 bits.
- // Check if off2 falls within that margin; if so off1+off2 can't overflow.
- const DataLayout &DL = CurDAG->getDataLayout();
- Align Alignment = commonAlignment(GA->getGlobal()->getPointerAlignment(DL),
- GA->getOffset());
- if (Offset2 != 0 && Alignment <= Offset2)
- return false;
- int64_t Offset1 = GA->getOffset();
- int64_t CombinedOffset = Offset1 + Offset2;
- ImmOperand = CurDAG->getTargetGlobalAddress(
- GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
- CombinedOffset, GA->getTargetFlags());
- } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) {
- // Ditto.
- Align Alignment = commonAlignment(CP->getAlign(), CP->getOffset());
- if (Offset2 != 0 && Alignment <= Offset2)
- return false;
- int64_t Offset1 = CP->getOffset();
- int64_t CombinedOffset = Offset1 + Offset2;
- ImmOperand = CurDAG->getTargetConstantPool(
- CP->getConstVal(), ImmOperand.getValueType(), CP->getAlign(),
- CombinedOffset, CP->getTargetFlags());
- } else {
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
- LLVM_DEBUG(Base->dump(CurDAG));
- LLVM_DEBUG(dbgs() << "\nN: ");
- LLVM_DEBUG(N->dump(CurDAG));
- LLVM_DEBUG(dbgs() << "\n");
-
- // Modify the offset operand of the load/store.
- if (BaseOpIdx == 0) { // Load
- N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
- N->getOperand(2));
- } else { // Store
- N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
- ImmOperand, N->getOperand(3));
- }
-
- return true;
-}
-
// Try to remove sext.w if the input is a W instruction or can be made into
// a W instruction cheaply.
bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) {
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index b50927cfcca5..ef46204c00ac 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -47,7 +47,6 @@ public:
bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectFrameAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool SelectBaseAddr(SDValue Addr, SDValue &Base);
bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset);
bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt);
@@ -61,6 +60,17 @@ public:
bool selectSExti32(SDValue N, SDValue &Val);
bool selectZExti32(SDValue N, SDValue &Val);
+ bool selectSHXADDOp(SDValue N, unsigned ShAmt, SDValue &Val);
+ bool selectSH1ADDOp(SDValue N, SDValue &Val) {
+ return selectSHXADDOp(N, 1, Val);
+ }
+ bool selectSH2ADDOp(SDValue N, SDValue &Val) {
+ return selectSHXADDOp(N, 2, Val);
+ }
+ bool selectSH3ADDOp(SDValue N, SDValue &Val) {
+ return selectSHXADDOp(N, 3, Val);
+ }
+
bool hasAllNBitUsers(SDNode *Node, unsigned Bits) const;
bool hasAllHUsers(SDNode *Node) const { return hasAllNBitUsers(Node, 16); }
bool hasAllWUsers(SDNode *Node) const { return hasAllNBitUsers(Node, 32); }
@@ -118,7 +128,6 @@ public:
#include "RISCVGenDAGISel.inc"
private:
- bool doPeepholeLoadStoreADDI(SDNode *Node);
bool doPeepholeSExtW(SDNode *Node);
bool doPeepholeMaskedRVV(SDNode *Node);
};
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ff645dea4e7a..658865703079 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -526,6 +526,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
{ISD::VP_FPTOSI, ISD::VP_FPTOUI, ISD::VP_TRUNCATE, ISD::VP_SETCC}, VT,
Custom);
setOperationAction(ISD::VECTOR_REVERSE, VT, Custom);
+
+ setOperationPromotedToType(
+ ISD::VECTOR_SPLICE, VT,
+ MVT::getVectorVT(MVT::i8, VT.getVectorElementCount()));
}
for (MVT VT : IntVecVTs) {
@@ -1157,6 +1161,37 @@ bool RISCVTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
return C && C->getAPIntValue().ule(10);
}
+bool RISCVTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getIntegerBitWidth();
+ if (BitSize > Subtarget.getXLen())
+ return false;
+
+ // Fast path, assume 32-bit immediates are cheap.
+ int64_t Val = Imm.getSExtValue();
+ if (isInt<32>(Val))
+ return true;
+
+ // A constant pool entry may be more aligned than the load we're trying to
+ // replace. If we don't support unaligned scalar mem, prefer the constant
+ // pool.
+ // TODO: Can the caller pass down the alignment?
+ if (!Subtarget.enableUnalignedScalarMem())
+ return true;
+
+ // Prefer to keep the load if it would require many instructions.
+ // This uses the same threshold we use for constant pools but doesn't
+ // check useConstantPoolForLargeInts.
+ // TODO: Should we keep the load only when we're definitely going to emit a
+ // constant pool?
+
+ RISCVMatInt::InstSeq Seq =
+ RISCVMatInt::generateInstSeq(Val, Subtarget.getFeatureBits());
+ return Seq.size() <= Subtarget.getMaxBuildIntsCost();
+}
+
bool RISCVTargetLowering::
shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
@@ -1659,7 +1694,7 @@ static SDValue convertFromScalableVector(EVT VT, SDValue V, SelectionDAG &DAG,
/// Return the type of the mask type suitable for masking the provided
/// vector type. This is simply an i1 element type vector of the same
/// (possibly scalable) length.
-static MVT getMaskTypeFor(EVT VecVT) {
+static MVT getMaskTypeFor(MVT VecVT) {
assert(VecVT.isVector());
ElementCount EC = VecVT.getVectorElementCount();
return MVT::getVectorVT(MVT::i1, EC);
@@ -5748,8 +5783,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_SPLICE(SDValue Op,
DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, VecVT, DAG.getUNDEF(VecVT), V1,
DownOffset, TrueMask, UpOffset);
return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VecVT, SlideDown, V2, UpOffset,
- TrueMask,
- DAG.getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT));
+ TrueMask, DAG.getRegister(RISCV::X0, XLenVT));
}
SDValue
@@ -8530,12 +8564,6 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
return Opcode;
}
-// Combine (sra (shl X, 32), 32 - C) -> (shl (sext_inreg X, i32), C)
-// FIXME: Should this be a generic combine? There's a similar combine on X86.
-//
-// Also try these folds where an add or sub is in the middle.
-// (sra (add (shl X, 32), C1), 32 - C) -> (shl (sext_inreg (add X, C1), C)
-// (sra (sub C1, (shl X, 32)), 32 - C) -> (shl (sext_inreg (sub C1, X), C)
static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
assert(N->getOpcode() == ISD::SRA && "Unexpected opcode");
@@ -8543,12 +8571,40 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
if (N->getValueType(0) != MVT::i64 || !Subtarget.is64Bit())
return SDValue();
- auto *ShAmtC = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!ShAmtC || ShAmtC->getZExtValue() > 32)
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ return SDValue();
+ uint64_t ShAmt = N->getConstantOperandVal(1);
+ if (ShAmt > 32)
return SDValue();
SDValue N0 = N->getOperand(0);
+ // Combine (sra (sext_inreg (shl X, C1), i32), C2) ->
+ // (sra (shl X, C1+32), C2+32) so it gets selected as SLLI+SRAI instead of
+ // SLLIW+SRAIW. SLLI+SRAI have compressed forms.
+ if (ShAmt < 32 &&
+ N0.getOpcode() == ISD::SIGN_EXTEND_INREG && N0.hasOneUse() &&
+ cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i32 &&
+ N0.getOperand(0).getOpcode() == ISD::SHL && N0.getOperand(0).hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(0).getOperand(1))) {
+ uint64_t LShAmt = N0.getOperand(0).getConstantOperandVal(1);
+ if (LShAmt < 32) {
+ SDLoc ShlDL(N0.getOperand(0));
+ SDValue Shl = DAG.getNode(ISD::SHL, ShlDL, MVT::i64,
+ N0.getOperand(0).getOperand(0),
+ DAG.getConstant(LShAmt + 32, ShlDL, MVT::i64));
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SRA, DL, MVT::i64, Shl,
+ DAG.getConstant(ShAmt + 32, DL, MVT::i64));
+ }
+ }
+
+ // Combine (sra (shl X, 32), 32 - C) -> (shl (sext_inreg X, i32), C)
+ // FIXME: Should this be a generic combine? There's a similar combine on X86.
+ //
+ // Also try these folds where an add or sub is in the middle.
+ // (sra (add (shl X, 32), C1), 32 - C) -> (shl (sext_inreg (add X, C1), C)
+ // (sra (sub C1, (shl X, 32)), 32 - C) -> (shl (sext_inreg (sub C1, X), C)
SDValue Shl;
ConstantSDNode *AddC = nullptr;
@@ -8594,12 +8650,12 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, In,
DAG.getValueType(MVT::i32));
- if (ShAmtC->getZExtValue() == 32)
+ if (ShAmt == 32)
return SExt;
return DAG.getNode(
ISD::SHL, DL, MVT::i64, SExt,
- DAG.getConstant(32 - ShAmtC->getZExtValue(), DL, MVT::i64));
+ DAG.getConstant(32 - ShAmt, DL, MVT::i64));
}
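(Standalone check of the new combine above, not LLVM code: (sra (sext_inreg (shl X, C1), i32), C2) computes the same value as (sra (shl X, C1+32), C2+32), and the latter selects to SLLI/SRAI which have compressed encodings. Arithmetic right shift of signed values is assumed, as on every mainstream compiler.)

#include <cassert>
#include <cstdint>

// Before: (sra (sext_inreg (shl X, C1), i32), C2) -> SLLIW + SRAIW on RV64.
static int64_t before(int64_t X, unsigned C1, unsigned C2) {
  int64_t Shl = (int64_t)((uint64_t)X << C1);
  int64_t Sext = (int32_t)Shl;  // sext_inreg ..., i32
  return Sext >> C2;            // arithmetic shift
}
// After: (sra (shl X, C1 + 32), C2 + 32) -> SLLI + SRAI (compressible).
static int64_t after(int64_t X, unsigned C1, unsigned C2) {
  int64_t Shl = (int64_t)((uint64_t)X << (C1 + 32));
  return Shl >> (C2 + 32);
}

int main() {
  for (int64_t X : {int64_t(0x123456789), int64_t(-7), int64_t(0)})
    assert(before(X, 5, 3) == after(X, 5, 3));
  return 0;
}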
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
@@ -9152,10 +9208,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
// FIXME: Support FP.
if (Val.getOpcode() == RISCVISD::VMV_X_S) {
SDValue Src = Val.getOperand(0);
- EVT VecVT = Src.getValueType();
+ MVT VecVT = Src.getSimpleValueType();
EVT MemVT = Store->getMemoryVT();
// The memory VT and the element type must match.
- if (VecVT.getVectorElementType() == MemVT) {
+ if (MemVT == VecVT.getVectorElementType()) {
SDLoc DL(N);
MVT MaskVT = getMaskTypeFor(VecVT);
return DAG.getStoreVP(
@@ -9864,7 +9920,7 @@ EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second,
Register FLHS = First.getOperand(1).getReg();
Register FRHS = First.getOperand(2).getReg();
// Insert appropriate branch.
- BuildMI(ThisMBB, DL, TII.getBrCond(FirstCC))
+ BuildMI(FirstMBB, DL, TII.getBrCond(FirstCC))
.addReg(FLHS)
.addReg(FRHS)
.addMBB(SinkMBB);
@@ -9876,7 +9932,7 @@ EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second,
auto SecondCC = static_cast<RISCVCC::CondCode>(Second.getOperand(3).getImm());
// Insert appropriate branch.
- BuildMI(FirstMBB, DL, TII.getBrCond(SecondCC))
+ BuildMI(ThisMBB, DL, TII.getBrCond(SecondCC))
.addReg(SLHS)
.addReg(SRHS)
.addMBB(SinkMBB);
@@ -9884,9 +9940,9 @@ EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second,
Register DestReg = Second.getOperand(0).getReg();
Register Op2Reg4 = Second.getOperand(4).getReg();
BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII.get(RISCV::PHI), DestReg)
- .addReg(Op1Reg4)
- .addMBB(ThisMBB)
.addReg(Op2Reg4)
+ .addMBB(ThisMBB)
+ .addReg(Op1Reg4)
.addMBB(FirstMBB)
.addReg(Op1Reg5)
.addMBB(SecondMBB);
@@ -12096,6 +12152,17 @@ const MCExpr *RISCVTargetLowering::LowerCustomJumpTableEntry(
return MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
}
+bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
+ // We define vscale to be VLEN/RVVBitsPerBlock. VLEN is always a power
+ // of two >= 64, and RVVBitsPerBlock is 64. Thus, vscale must be
+ // a power of two as well.
+ // FIXME: This doesn't work for zve32, but that's already broken
+ // elsewhere for the same reason.
+ assert(Subtarget.getRealMinVLen() >= 64 && "zve32* unsupported");
+ assert(RISCV::RVVBitsPerBlock == 64 &&
+ "RVVBitsPerBlock changed, audit needed");
+ return true;
+}
+
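(A trivial standalone check of the reasoning above, not LLVM code: with RVVBitsPerBlock == 64 and VLEN restricted to powers of two >= 64, vscale = VLEN/64 is always a power of two.)

#include <cassert>

static bool isPowerOfTwo(unsigned V) { return V != 0 && (V & (V - 1)) == 0; }

int main() {
  // vscale = VLEN / RVVBitsPerBlock, with RVVBitsPerBlock == 64.
  for (unsigned VLEN = 64; VLEN <= 65536; VLEN *= 2)
    assert(isPowerOfTwo(VLEN / 64));
  return 0;
}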
bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
VT = VT.getScalarType();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index eb013d4b6682..5e15176de59c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -520,9 +520,7 @@ public:
SmallVectorImpl<SDValue> &InVals) const override;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
- Type *Ty) const override {
- return true;
- }
+ Type *Ty) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
bool shouldConsiderGEPOffsetSplit() const override { return true; }
@@ -599,6 +597,8 @@ public:
unsigned uid,
MCContext &Ctx) const override;
+ bool isVScaleKnownToBeAPowerOfTwo() const override;
+
private:
/// RISCVCCAssignFn - This target-specific function extends the default
/// CCValAssign with additional information used to lower RISC-V calling
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index ee4c026af8f4..06a90438838e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -384,7 +384,6 @@ def uimm6gt32 : ImmLeaf<XLenVT, [{
// Necessary because a frameindex can't be matched directly in a pattern.
def FrameAddrRegImm : ComplexPattern<iPTR, 2, "SelectFrameAddrRegImm",
[frameindex, or, add]>;
-def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">;
def AddrRegImm : ComplexPattern<iPTR, 2, "SelectAddrRegImm">;
// Return the negation of an immediate value.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index f8bc241039f8..1ad634344c09 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -115,6 +115,35 @@ class VSXSched<int n, string o> :
class VLFSched<int n> : Sched <[!cast<SchedReadWrite>("WriteVLDFF" # n),
ReadVLDX, ReadVMask]>;
+// Unit-Stride Segment Loads and Stores
+class VLSEGSched<int nf, int eew> : Sched<[
+ !cast<SchedReadWrite>("WriteVLSEG" #nf #"e" #eew), ReadVLDX, ReadVMask]>;
+class VSSEGSched<int nf, int eew> : Sched<[
+ !cast<SchedReadWrite>("WriteVSSEG" #nf #"e" #eew),
+ !cast<SchedReadWrite>("ReadVSTE" #eew #"V"), ReadVSTX, ReadVMask]>;
+class VLSEGFFSched<int nf, int eew> : Sched<[
+ !cast<SchedReadWrite>("WriteVLSEGFF" #nf #"e" #eew), ReadVLDX, ReadVMask]>;
+// Strided Segment Loads and Stores
+class VLSSEGSched<int nf, int eew> : Sched<[
+ !cast<SchedReadWrite>("WriteVLSSEG" #nf #"e" #eew), ReadVLDX, ReadVLDSX,
+ ReadVMask]>;
+class VSSSEGSched<int nf, int eew> : Sched<[
+ !cast<SchedReadWrite>("WriteVSSSEG" #nf #"e" #eew),
+ !cast<SchedReadWrite>("ReadVSTS" #eew #"V"), ReadVSTX, ReadVSTSX, ReadVMask]>;
+// Indexed Segment Loads and Stores
+class VLUXSEGSched<int nf, int eew> : Sched<[
+ !cast<SchedReadWrite>("WriteVLUXSEG" #nf #"e" #eew), ReadVLDX, ReadVLDUXV,
+ ReadVMask]>;
+class VLOXSEGSched<int nf, int eew> : Sched<[
+ !cast<SchedReadWrite>("WriteVLOXSEG" #nf #"e" #eew), ReadVLDX, ReadVLDOXV,
+ ReadVMask]>;
+class VSUXSEGSched<int nf, int eew> : Sched<[
+ !cast<SchedReadWrite>("WriteVSUXSEG" #nf #"e" #eew),
+ !cast<SchedReadWrite>("ReadVSTUX" #eew), ReadVSTX, ReadVSTUXV, ReadVMask]>;
+class VSOXSEGSched<int nf, int eew> : Sched<[
+ !cast<SchedReadWrite>("WriteVSOXSEG" #nf #"e" #eew),
+ !cast<SchedReadWrite>("ReadVSTOX" #eew), ReadVSTX, ReadVSTOXV, ReadVMask]>;
+
//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
@@ -1476,14 +1505,9 @@ defm VCOMPRESS_V : VCPR_MV_Mask<"vcompress", 0b010111>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
RVVConstraint = NoConstraint in {
-def VMV1R_V : RVInstV<0b100111, 0, OPIVI, (outs VR:$vd), (ins VR:$vs2),
- "vmv1r.v", "$vd, $vs2">, VMVRSched<1> {
- let Uses = [];
- let vm = 1;
-}
// A future extension may relax the vector register alignment restrictions.
-foreach n = [2, 4, 8] in {
- defvar vrc = !cast<VReg>("VRM"#n);
+foreach n = [1, 2, 4, 8] in {
+ defvar vrc = !cast<VReg>(!if(!eq(n, 1), "VR", "VRM"#n));
def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs vrc:$vd),
(ins vrc:$vs2), "vmv" # n # "r.v", "$vd, $vs2">,
VMVRSched<n> {
@@ -1500,31 +1524,35 @@ let Predicates = [HasVInstructions] in {
defvar w = !cast<RISCVWidth>("LSWidth"#eew);
def VLSEG#nf#E#eew#_V :
- VUnitStrideSegmentLoad<!add(nf, -1), w, "vlseg"#nf#"e"#eew#".v">;
+ VUnitStrideSegmentLoad<!add(nf, -1), w, "vlseg"#nf#"e"#eew#".v">,
+ VLSEGSched<nf, eew>;
def VLSEG#nf#E#eew#FF_V :
- VUnitStrideSegmentLoadFF<!add(nf, -1), w, "vlseg"#nf#"e"#eew#"ff.v">;
+ VUnitStrideSegmentLoadFF<!add(nf, -1), w, "vlseg"#nf#"e"#eew#"ff.v">,
+ VLSEGFFSched<nf, eew>;
def VSSEG#nf#E#eew#_V :
- VUnitStrideSegmentStore<!add(nf, -1), w, "vsseg"#nf#"e"#eew#".v">;
-
+ VUnitStrideSegmentStore<!add(nf, -1), w, "vsseg"#nf#"e"#eew#".v">,
+ VSSEGSched<nf, eew>;
// Vector Strided Instructions
def VLSSEG#nf#E#eew#_V :
- VStridedSegmentLoad<!add(nf, -1), w, "vlsseg"#nf#"e"#eew#".v">;
+ VStridedSegmentLoad<!add(nf, -1), w, "vlsseg"#nf#"e"#eew#".v">,
+ VLSSEGSched<nf, eew>;
def VSSSEG#nf#E#eew#_V :
- VStridedSegmentStore<!add(nf, -1), w, "vssseg"#nf#"e"#eew#".v">;
+ VStridedSegmentStore<!add(nf, -1), w, "vssseg"#nf#"e"#eew#".v">,
+ VSSSEGSched<nf, eew>;
// Vector Indexed Instructions
def VLUXSEG#nf#EI#eew#_V :
VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord, w,
- "vluxseg"#nf#"ei"#eew#".v">;
+ "vluxseg"#nf#"ei"#eew#".v">, VLUXSEGSched<nf, eew>;
def VLOXSEG#nf#EI#eew#_V :
VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder, w,
- "vloxseg"#nf#"ei"#eew#".v">;
+ "vloxseg"#nf#"ei"#eew#".v">, VLOXSEGSched<nf, eew>;
def VSUXSEG#nf#EI#eew#_V :
VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord, w,
- "vsuxseg"#nf#"ei"#eew#".v">;
+ "vsuxseg"#nf#"ei"#eew#".v">, VSUXSEGSched<nf, eew>;
def VSOXSEG#nf#EI#eew#_V :
VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder, w,
- "vsoxseg"#nf#"ei"#eew#".v">;
+ "vsoxseg"#nf#"ei"#eew#".v">, VSOXSEGSched<nf, eew>;
}
}
} // Predicates = [HasVInstructions]
@@ -1533,17 +1561,22 @@ let Predicates = [HasVInstructionsI64] in {
foreach nf=2-8 in {
// Vector Unit-strided Segment Instructions
def VLSEG#nf#E64_V :
- VUnitStrideSegmentLoad<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64.v">;
+ VUnitStrideSegmentLoad<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64.v">,
+ VLSEGSched<nf, 64>;
def VLSEG#nf#E64FF_V :
- VUnitStrideSegmentLoadFF<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64ff.v">;
+ VUnitStrideSegmentLoadFF<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64ff.v">,
+ VLSEGFFSched<nf, 64>;
def VSSEG#nf#E64_V :
- VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">;
+ VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">,
+ VSSEGSched<nf, 64>;
// Vector Strided Segment Instructions
def VLSSEG#nf#E64_V :
- VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">;
+ VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">,
+ VLSSEGSched<nf, 64>;
def VSSSEG#nf#E64_V :
- VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">;
+ VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">,
+ VSSSEGSched<nf, 64>;
}
} // Predicates = [HasVInstructionsI64]
let Predicates = [HasVInstructionsI64, IsRV64] in {
@@ -1551,16 +1584,16 @@ let Predicates = [HasVInstructionsI64, IsRV64] in {
// Vector Indexed Segment Instructions
def VLUXSEG#nf#EI64_V :
VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord, LSWidth64,
- "vluxseg"#nf#"ei64.v">;
+ "vluxseg"#nf#"ei64.v">, VLUXSEGSched<nf, 64>;
def VLOXSEG#nf#EI64_V :
VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder, LSWidth64,
- "vloxseg"#nf#"ei64.v">;
+ "vloxseg"#nf#"ei64.v">, VLOXSEGSched<nf, 64>;
def VSUXSEG#nf#EI64_V :
VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord, LSWidth64,
- "vsuxseg"#nf#"ei64.v">;
+ "vsuxseg"#nf#"ei64.v">, VSUXSEGSched<nf, 64>;
def VSOXSEG#nf#EI64_V :
VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder, LSWidth64,
- "vsoxseg"#nf#"ei64.v">;
+ "vsoxseg"#nf#"ei64.v">, VSOXSEGSched<nf, 64>;
}
} // Predicates = [HasVInstructionsI64, IsRV64]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 06d4c4d0a9e6..b7b25643e397 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -34,11 +34,11 @@ multiclass VPatUSLoadStoreSDNode<ValueType type,
defvar load_instr = !cast<Instruction>("PseudoVLE"#sew#"_V_"#vlmul.MX);
defvar store_instr = !cast<Instruction>("PseudoVSE"#sew#"_V_"#vlmul.MX);
// Load
- def : Pat<(type (load BaseAddr:$rs1)),
- (load_instr BaseAddr:$rs1, avl, log2sew)>;
+ def : Pat<(type (load GPR:$rs1)),
+ (load_instr GPR:$rs1, avl, log2sew)>;
// Store
- def : Pat<(store type:$rs2, BaseAddr:$rs1),
- (store_instr reg_class:$rs2, BaseAddr:$rs1, avl, log2sew)>;
+ def : Pat<(store type:$rs2, GPR:$rs1),
+ (store_instr reg_class:$rs2, GPR:$rs1, avl, log2sew)>;
}
multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type,
@@ -53,11 +53,11 @@ multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type,
!cast<Instruction>("VS"#!substr(vlmul.MX, 1)#"R_V");
// Load
- def : Pat<(type (load BaseAddr:$rs1)),
- (load_instr BaseAddr:$rs1)>;
+ def : Pat<(type (load GPR:$rs1)),
+ (load_instr GPR:$rs1)>;
// Store
- def : Pat<(store type:$rs2, BaseAddr:$rs1),
- (store_instr reg_class:$rs2, BaseAddr:$rs1)>;
+ def : Pat<(store type:$rs2, GPR:$rs1),
+ (store_instr reg_class:$rs2, GPR:$rs1)>;
}
multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m>
@@ -65,11 +65,11 @@ multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m>
defvar load_instr = !cast<Instruction>("PseudoVLM_V_"#m.BX);
defvar store_instr = !cast<Instruction>("PseudoVSM_V_"#m.BX);
// Load
- def : Pat<(m.Mask (load BaseAddr:$rs1)),
- (load_instr BaseAddr:$rs1, m.AVL, m.Log2SEW)>;
+ def : Pat<(m.Mask (load GPR:$rs1)),
+ (load_instr GPR:$rs1, m.AVL, m.Log2SEW)>;
// Store
- def : Pat<(store m.Mask:$rs2, BaseAddr:$rs1),
- (store_instr VR:$rs2, BaseAddr:$rs1, m.AVL, m.Log2SEW)>;
+ def : Pat<(store m.Mask:$rs2, GPR:$rs1),
+ (store_instr VR:$rs2, GPR:$rs1, m.AVL, m.Log2SEW)>;
}
class VPatBinarySDNode_VV<SDNode vop,
@@ -1038,10 +1038,14 @@ let Predicates = [HasVInstructionsAnyF] in
foreach vti = AllFloatVectors in {
// Fold store of vmv.f.s to a vse with VL=1.
defvar store_instr = !cast<Instruction>("PseudoVSE"#vti.SEW#"_V_"#vti.LMul.MX);
- def : Pat<(store (vti.Scalar (int_riscv_vfmv_f_s (vti.Vector vti.RegClass:$rs2))), BaseAddr:$rs1),
- (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>;
- def : Pat<(store (extractelt (vti.Vector vti.RegClass:$rs2), 0), BaseAddr:$rs1),
- (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>;
+
+ let AddedComplexity = 2 in {
+ // Add complexity to increase the priority of this pattern being matched.
+ def : Pat<(store (vti.Scalar (int_riscv_vfmv_f_s (vti.Vector vti.RegClass:$rs2))), GPR:$rs1),
+ (store_instr vti.RegClass:$rs2, GPR:$rs1, 1, vti.Log2SEW)>;
+ def : Pat<(store (extractelt (vti.Vector vti.RegClass:$rs2), 0), GPR:$rs1),
+ (store_instr vti.RegClass:$rs2, GPR:$rs1, 1, vti.Log2SEW)>;
+ }
defvar vmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_",
vti.ScalarSuffix,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 081f61617d59..49306bb0f4e2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -76,13 +76,13 @@ def riscv_urem_vl : SDNode<"RISCVISD::UREM_VL", SDT_RISCVIntBinOp_VL>;
def riscv_shl_vl : SDNode<"RISCVISD::SHL_VL", SDT_RISCVIntBinOp_VL>;
def riscv_sra_vl : SDNode<"RISCVISD::SRA_VL", SDT_RISCVIntBinOp_VL>;
def riscv_srl_vl : SDNode<"RISCVISD::SRL_VL", SDT_RISCVIntBinOp_VL>;
-def riscv_smin_vl : SDNode<"RISCVISD::SMIN_VL", SDT_RISCVIntBinOp_VL>;
-def riscv_smax_vl : SDNode<"RISCVISD::SMAX_VL", SDT_RISCVIntBinOp_VL>;
-def riscv_umin_vl : SDNode<"RISCVISD::UMIN_VL", SDT_RISCVIntBinOp_VL>;
-def riscv_umax_vl : SDNode<"RISCVISD::UMAX_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_smin_vl : SDNode<"RISCVISD::SMIN_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_smax_vl : SDNode<"RISCVISD::SMAX_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_umin_vl : SDNode<"RISCVISD::UMIN_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_umax_vl : SDNode<"RISCVISD::UMAX_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
-def riscv_saddsat_vl : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL>;
-def riscv_uaddsat_vl : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_saddsat_vl : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
+def riscv_uaddsat_vl : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;
def riscv_ssubsat_vl : SDNode<"RISCVISD::SSUBSAT_VL", SDT_RISCVIntBinOp_VL>;
def riscv_usubsat_vl : SDNode<"RISCVISD::USUBSAT_VL", SDT_RISCVIntBinOp_VL>;
@@ -94,8 +94,8 @@ def riscv_fneg_vl : SDNode<"RISCVISD::FNEG_VL", SDT_RISCVFPUnOp_VL>;
def riscv_fabs_vl : SDNode<"RISCVISD::FABS_VL", SDT_RISCVFPUnOp_VL>;
def riscv_fsqrt_vl : SDNode<"RISCVISD::FSQRT_VL", SDT_RISCVFPUnOp_VL>;
def riscv_fcopysign_vl : SDNode<"RISCVISD::FCOPYSIGN_VL", SDT_RISCVFPBinOp_VL>;
-def riscv_fminnum_vl : SDNode<"RISCVISD::FMINNUM_VL", SDT_RISCVFPBinOp_VL>;
-def riscv_fmaxnum_vl : SDNode<"RISCVISD::FMAXNUM_VL", SDT_RISCVFPBinOp_VL>;
+def riscv_fminnum_vl : SDNode<"RISCVISD::FMINNUM_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
+def riscv_fmaxnum_vl : SDNode<"RISCVISD::FMAXNUM_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
def SDT_RISCVVecFMA_VL : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 9532d1dd3dd2..02ae4f88d56a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -83,13 +83,13 @@ def shfl_uimm : Operand<XLenVT>, ImmLeaf<XLenVT, [{
def BCLRXForm : SDNodeXForm<imm, [{
// Find the lowest 0.
- return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingOnes(),
+ return CurDAG->getTargetConstant(countTrailingOnes(N->getZExtValue()),
SDLoc(N), N->getValueType(0));
}]>;
def BSETINVXForm : SDNodeXForm<imm, [{
// Find the lowest 1.
- return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingZeros(),
+ return CurDAG->getTargetConstant(countTrailingZeros(N->getZExtValue()),
SDLoc(N), N->getValueType(0));
}]>;
@@ -239,6 +239,10 @@ def non_imm12 : PatLeaf<(XLenVT GPR:$a), [{
return !C || !isInt<12>(C->getSExtValue());
}]>;
+def sh1add_op : ComplexPattern<XLenVT, 1, "selectSH1ADDOp", [], [], 6>;
+def sh2add_op : ComplexPattern<XLenVT, 1, "selectSH2ADDOp", [], [], 6>;
+def sh3add_op : ComplexPattern<XLenVT, 1, "selectSH3ADDOp", [], [], 6>;
+
//===----------------------------------------------------------------------===//
// Instruction class templates
//===----------------------------------------------------------------------===//
@@ -1095,6 +1099,14 @@ def : Pat<(add (shl GPR:$rs1, (XLenVT 2)), non_imm12:$rs2),
def : Pat<(add (shl GPR:$rs1, (XLenVT 3)), non_imm12:$rs2),
(SH3ADD GPR:$rs1, GPR:$rs2)>;
+// More complex cases use a ComplexPattern.
+def : Pat<(add sh1add_op:$rs1, non_imm12:$rs2),
+ (SH1ADD sh1add_op:$rs1, GPR:$rs2)>;
+def : Pat<(add sh2add_op:$rs1, non_imm12:$rs2),
+ (SH2ADD sh2add_op:$rs1, GPR:$rs2)>;
+def : Pat<(add sh3add_op:$rs1, non_imm12:$rs2),
+ (SH3ADD sh3add_op:$rs1, GPR:$rs2)>;
+
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2),
(SH1ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>;
def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2),
@@ -1190,18 +1202,6 @@ def : Pat<(i64 (add (and GPR:$rs1, 0x3FFFFFFFC), non_imm12:$rs2)),
(SH2ADD_UW (SRLI GPR:$rs1, 2), GPR:$rs2)>;
def : Pat<(i64 (add (and GPR:$rs1, 0x7FFFFFFF8), non_imm12:$rs2)),
(SH3ADD_UW (SRLI GPR:$rs1, 3), GPR:$rs2)>;
-
-// Use SRLIW to shift out the LSBs and zero the upper 32-bits. Use SHXADD to
-// shift zeros into the LSBs the addition shl amount.
-def : Pat<(i64 (add (shl (binop_oneuse<and> GPR:$rs1, 0xFFFFFFFE), (i64 1)),
- non_imm12:$rs2)),
- (SH2ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>;
-def : Pat<(i64 (add (shl (binop_oneuse<and> GPR:$rs1, 0xFFFFFFFE), (i64 2)),
- non_imm12:$rs2)),
- (SH3ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>;
-def : Pat<(i64 (add (shl (binop_oneuse<and> GPR:$rs1, 0xFFFFFFFC), (i64 1)),
- non_imm12:$rs2)),
- (SH3ADD (SRLIW GPR:$rs1, 2), GPR:$rs2)>;
} // Predicates = [HasStdExtZba, IsRV64]
let Predicates = [HasStdExtZbcOrZbkc] in {
diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
index 1fc424411c12..dad0aa476471 100644
--- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp
@@ -293,8 +293,16 @@ static void updateOperands(MachineInstr &MI, RegImmPair OldRegImm,
assert((isCompressibleLoad(MI) || isCompressibleStore(MI)) &&
"Unsupported instruction for this optimization.");
+ int SkipN = 0;
+
+ // Skip the first (value) operand to a store instruction (except if the store
+ // offset is zero) in order to avoid an incorrect transformation.
+ // e.g. sd a0, 808(a0) to addi a2, a0, 768; sd a2, 40(a2)
+ if (isCompressibleStore(MI) && OldRegImm.Imm != 0)
+ SkipN = 1;
+
// Update registers
- for (MachineOperand &MO : MI.operands())
+ for (MachineOperand &MO : drop_begin(MI.operands(), SkipN))
if (MO.isReg() && MO.getReg() == OldRegImm.Reg) {
// Do not update operands that define the old register.
//
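
A standalone C++ sketch (not part of the patch) of the operand-skipping rule introduced above: when the value register of a store equals its address register and the adjusted offset is non-zero, the value operand must keep the old register, which is what drop_begin(MI.operands(), SkipN) achieves. Toy operand records stand in for MachineOperand; every name below is illustrative.

  #include <cstdio>
  #include <vector>

  // Toy stand-in for a machine operand: either a register or an immediate.
  struct Operand { bool IsReg; int Reg; long Imm; };

  // Rewrite "sd a0, 808(a0)"-style instructions against a new base register.
  // Skip the first (value) operand when the adjusted offset is non-zero, so the
  // stored value keeps referring to the original register.
  static void updateStoreOperands(std::vector<Operand> &Ops, int OldReg,
                                  int NewReg, long NewImm) {
    size_t SkipN = (NewImm != 0) ? 1 : 0; // mirrors SkipN in the patch above
    for (size_t I = SkipN; I < Ops.size(); ++I) {
      Operand &MO = Ops[I];
      if (MO.IsReg && MO.Reg == OldReg)
        MO.Reg = NewReg;
      else if (!MO.IsReg)
        MO.Imm = NewImm;
    }
  }

  int main() {
    // Operands of "sd a0, 808(a0)": value a0 (x10), base a0 (x10), offset 808.
    std::vector<Operand> Ops = {{true, 10, 0}, {true, 10, 0}, {false, 0, 808}};
    updateStoreOperands(Ops, /*OldReg=*/10, /*NewReg=*/12, /*NewImm=*/40);
    // The value operand stays x10; only the base register and offset change.
    std::printf("value=x%d base=x%d imm=%ld\n", Ops[0].Reg, Ops[1].Reg, Ops[2].Imm);
  }
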
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 43af1802d706..bafcf47b82e4 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -53,6 +53,20 @@ def WriteVLDFF8 : SchedWrite;
def WriteVLDFF16 : SchedWrite;
def WriteVLDFF32 : SchedWrite;
def WriteVLDFF64 : SchedWrite;
+// 7.8. Vector Segment Instructions
+foreach nf=2-8 in {
+ foreach eew = [8, 16, 32, 64] in {
+ def WriteVLSEG # nf # e # eew : SchedWrite;
+ def WriteVSSEG # nf # e # eew : SchedWrite;
+ def WriteVLSEGFF # nf # e # eew : SchedWrite;
+ def WriteVLSSEG # nf # e # eew : SchedWrite;
+ def WriteVSSSEG # nf # e # eew : SchedWrite;
+ def WriteVLUXSEG # nf # e # eew : SchedWrite;
+ def WriteVLOXSEG # nf # e # eew : SchedWrite;
+ def WriteVSUXSEG # nf # e # eew : SchedWrite;
+ def WriteVSOXSEG # nf # e # eew : SchedWrite;
+ }
+}
// 7.9. Vector Whole Register Instructions
def WriteVLD1R8 : SchedWrite;
def WriteVLD1R16 : SchedWrite;
@@ -538,6 +552,20 @@ def : WriteRes<WriteVST1R, []>;
def : WriteRes<WriteVST2R, []>;
def : WriteRes<WriteVST4R, []>;
def : WriteRes<WriteVST8R, []>;
+// Vector Segment Loads and Stores
+foreach nf=2-8 in {
+ foreach eew = [8, 16, 32, 64] in {
+ def : WriteRes <!cast<SchedWrite>("WriteVLSEG" # nf # "e" # eew), []>;
+ def : WriteRes <!cast<SchedWrite>("WriteVLSEGFF" # nf # "e" # eew), []>;
+ def : WriteRes <!cast<SchedWrite>("WriteVSSEG" # nf # "e" # eew), []>;
+ def : WriteRes <!cast<SchedWrite>("WriteVLSSEG" # nf # "e" # eew), []>;
+ def : WriteRes <!cast<SchedWrite>("WriteVSSSEG" # nf # "e" # eew), []>;
+ def : WriteRes <!cast<SchedWrite>("WriteVLUXSEG" # nf # "e" # eew), []>;
+ def : WriteRes <!cast<SchedWrite>("WriteVLOXSEG" # nf # "e" # eew), []>;
+ def : WriteRes <!cast<SchedWrite>("WriteVSUXSEG" # nf # "e" # eew), []>;
+ def : WriteRes <!cast<SchedWrite>("WriteVSOXSEG" # nf # "e" # eew), []>;
+ }
+}
// 12. Vector Integer Arithmetic Instructions
def : WriteRes<WriteVIALUV, []>;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 7caf0fedb2ca..96c46fb7554f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -57,6 +57,10 @@ public:
bool shouldExpandReduction(const IntrinsicInst *II) const;
bool supportsScalableVectors() const { return ST->hasVInstructions(); }
+ PredicationStyle emitGetActiveLaneMask() const {
+ return ST->hasVInstructions() ? PredicationStyle::Data
+ : PredicationStyle::None;
+ }
Optional<unsigned> getMaxVScale() const;
Optional<unsigned> getVScaleForTuning() const;
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp
index d953bc590473..f726f42c9bcb 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp
@@ -46,12 +46,6 @@ public:
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
-
-private:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
@@ -110,9 +104,6 @@ static void emitUntypedInstrOperands(const MCInst &MI, EndianWriter &OSE) {
void SPIRVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- auto Features = computeAvailableFeatures(STI.getFeatureBits());
- verifyInstructionPredicates(MI, Features);
-
EndianWriter OSE(OS, support::little);
// Encode the first 32 SPIR-V bytes with the number of args and the opcode.
@@ -128,5 +119,4 @@ void SPIRVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
emitUntypedInstrOperands(MI, OSE);
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SPIRVGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
index 6b8b4a73af92..62ce15550ae7 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
@@ -22,6 +22,7 @@
#include "llvm/MC/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SPIRVGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h
index 4009fa96aa68..abc8df34be0a 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h
@@ -44,6 +44,7 @@ std::unique_ptr<MCObjectTargetWriter> createSPIRVObjectTargetWriter();
// Defines symbolic names for the SPIR-V instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "SPIRVGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 0de232651377..605bf949187f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -215,6 +215,9 @@ void SPIRVAsmPrinter::outputInstruction(const MachineInstr *MI) {
}
void SPIRVAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ SPIRV_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
if (!MAI->getSkipEmission(MI))
outputInstruction(MI);
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index df07a126eeea..5b6b82aebf30 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -68,6 +68,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
ArrayRef<ArrayRef<Register>> VRegs,
FunctionLoweringInfo &FLI) const {
assert(GR && "Must initialize the SPIRV type registry before lowering args.");
+ GR->setCurrentFunc(MIRBuilder.getMF());
// Assign types and names to all args, and store their types for later.
SmallVector<Register, 4> ArgTypeVRegs;
@@ -114,6 +115,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
auto MRI = MIRBuilder.getMRI();
Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(32));
MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass);
+ if (F.isDeclaration())
+ GR->add(&F, &MIRBuilder.getMF(), FuncVReg);
auto *FTy = F.getFunctionType();
auto FuncTy = GR->assignTypeToVReg(FTy, FuncVReg, MIRBuilder);
@@ -136,6 +139,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
MIRBuilder.buildInstr(SPIRV::OpFunctionParameter)
.addDef(VRegs[i][0])
.addUse(ArgTypeVRegs[i]);
+ if (F.isDeclaration())
+ GR->add(F.getArg(i), &MIRBuilder.getMF(), VRegs[i][0]);
}
// Name the function.
if (F.hasName())
@@ -165,6 +170,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (Info.OrigRet.Regs.size() > 1)
return false;
+ GR->setCurrentFunc(MIRBuilder.getMF());
Register ResVReg =
Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0];
// Emit a regular OpFunctionCall. If it's an externally declared function,
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
new file mode 100644
index 000000000000..57cd4bafd351
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
@@ -0,0 +1,95 @@
+//===-- SPIRVDuplicatesTracker.cpp - SPIR-V Duplicates Tracker --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// General infrastructure for keeping track of the values that, according to
+// the SPIR-V binary layout, should be global to the whole module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVDuplicatesTracker.h"
+
+using namespace llvm;
+
+template <typename T>
+void SPIRVGeneralDuplicatesTracker::prebuildReg2Entry(
+ SPIRVDuplicatesTracker<T> &DT, SPIRVReg2EntryTy &Reg2Entry) {
+ for (auto &TPair : DT.getAllUses()) {
+ for (auto &RegPair : TPair.second) {
+ const MachineFunction *MF = RegPair.first;
+ Register R = RegPair.second;
+ MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(R);
+ if (!MI)
+ continue;
+ Reg2Entry[&MI->getOperand(0)] = &TPair.second;
+ }
+ }
+}
+
+void SPIRVGeneralDuplicatesTracker::buildDepsGraph(
+ std::vector<SPIRV::DTSortableEntry *> &Graph,
+ MachineModuleInfo *MMI = nullptr) {
+ SPIRVReg2EntryTy Reg2Entry;
+ prebuildReg2Entry(TT, Reg2Entry);
+ prebuildReg2Entry(CT, Reg2Entry);
+ prebuildReg2Entry(GT, Reg2Entry);
+ prebuildReg2Entry(FT, Reg2Entry);
+ prebuildReg2Entry(AT, Reg2Entry);
+
+ for (auto &Op2E : Reg2Entry) {
+ SPIRV::DTSortableEntry *E = Op2E.second;
+ Graph.push_back(E);
+ for (auto &U : *E) {
+ const MachineRegisterInfo &MRI = U.first->getRegInfo();
+ MachineInstr *MI = MRI.getUniqueVRegDef(U.second);
+ if (!MI)
+ continue;
+ assert(MI && MI->getParent() && "No MachineInstr created yet");
+ for (auto i = MI->getNumDefs(); i < MI->getNumOperands(); i++) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg())
+ continue;
+ MachineOperand *RegOp = &MRI.getVRegDef(Op.getReg())->getOperand(0);
+ assert((MI->getOpcode() == SPIRV::OpVariable && i == 3) ||
+ Reg2Entry.count(RegOp));
+ if (Reg2Entry.count(RegOp))
+ E->addDep(Reg2Entry[RegOp]);
+ }
+
+ if (E->getIsFunc()) {
+ MachineInstr *Next = MI->getNextNode();
+ if (Next && (Next->getOpcode() == SPIRV::OpFunction ||
+ Next->getOpcode() == SPIRV::OpFunctionParameter)) {
+ E->addDep(Reg2Entry[&Next->getOperand(0)]);
+ }
+ }
+ }
+ }
+
+ if (MMI) {
+ const Module *M = MMI->getModule();
+ for (auto F = M->begin(), E = M->end(); F != E; ++F) {
+ const MachineFunction *MF = MMI->getMachineFunction(*F);
+ if (!MF)
+ continue;
+ for (const MachineBasicBlock &MBB : *MF) {
+ for (const MachineInstr &CMI : MBB) {
+ MachineInstr &MI = const_cast<MachineInstr &>(CMI);
+ MI.dump();
+ if (MI.getNumExplicitDefs() > 0 &&
+ Reg2Entry.count(&MI.getOperand(0))) {
+ dbgs() << "\t[";
+ for (SPIRV::DTSortableEntry *D :
+ Reg2Entry.lookup(&MI.getOperand(0))->getDeps())
+ dbgs() << Register::virtReg2Index(D->lookup(MF)) << ", ";
+ dbgs() << "]\n";
+ }
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
new file mode 100644
index 000000000000..58ae1f86ce42
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
@@ -0,0 +1,174 @@
+//===-- SPIRVDuplicatesTracker.h - SPIR-V Duplicates Tracker ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// General infrastructure for keeping track of the values that, according to
+// the SPIR-V binary layout, should be global to the whole module.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVDUPLICATESTRACKER_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVDUPLICATESTRACKER_H
+
+#include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "MCTargetDesc/SPIRVMCTargetDesc.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+#include <type_traits>
+
+namespace llvm {
+namespace SPIRV {
+// NOTE: using MapVector instead of DenseMap because it helps keep everything
+// ordered in a stable manner, at the price of extra (NumKeys)*PtrSize memory
+// and expensive removals, which do not happen anyway.
+class DTSortableEntry : public MapVector<const MachineFunction *, Register> {
+ SmallVector<DTSortableEntry *, 2> Deps;
+
+ struct FlagsTy {
+ unsigned IsFunc : 1;
+ unsigned IsGV : 1;
+ // NOTE: bit-field default init is a C++20 feature.
+ FlagsTy() : IsFunc(0), IsGV(0) {}
+ };
+ FlagsTy Flags;
+
+public:
+ // The common hoisting utility doesn't support functions, because hoisting
+ // them requires hoisting their params as well.
+ bool getIsFunc() const { return Flags.IsFunc; }
+ bool getIsGV() const { return Flags.IsGV; }
+ void setIsFunc(bool V) { Flags.IsFunc = V; }
+ void setIsGV(bool V) { Flags.IsGV = V; }
+
+ const SmallVector<DTSortableEntry *, 2> &getDeps() const { return Deps; }
+ void addDep(DTSortableEntry *E) { Deps.push_back(E); }
+};
+} // namespace SPIRV
+
+template <typename KeyTy> class SPIRVDuplicatesTrackerBase {
+public:
+ // NOTE: using MapVector instead of DenseMap helps keep everything ordered in
+ // a stable manner, at the price of extra (NumKeys)*PtrSize memory and
+ // expensive removals, which don't happen anyway.
+ using StorageTy = MapVector<KeyTy, SPIRV::DTSortableEntry>;
+
+private:
+ StorageTy Storage;
+
+public:
+ void add(KeyTy V, const MachineFunction *MF, Register R) {
+ if (find(V, MF).isValid())
+ return;
+
+ Storage[V][MF] = R;
+ if (std::is_same<Function,
+ typename std::remove_const<
+ typename std::remove_pointer<KeyTy>::type>::type>() ||
+ std::is_same<Argument,
+ typename std::remove_const<
+ typename std::remove_pointer<KeyTy>::type>::type>())
+ Storage[V].setIsFunc(true);
+ if (std::is_same<GlobalVariable,
+ typename std::remove_const<
+ typename std::remove_pointer<KeyTy>::type>::type>())
+ Storage[V].setIsGV(true);
+ }
+
+ Register find(KeyTy V, const MachineFunction *MF) const {
+ auto iter = Storage.find(V);
+ if (iter != Storage.end()) {
+ auto Map = iter->second;
+ auto iter2 = Map.find(MF);
+ if (iter2 != Map.end())
+ return iter2->second;
+ }
+ return Register();
+ }
+
+ const StorageTy &getAllUses() const { return Storage; }
+
+private:
+ StorageTy &getAllUses() { return Storage; }
+
+ // The friend class needs access to the internal storage to be able to build
+ // the dependency graph. We can't declare only one function a 'friend' due to
+ // the incomplete declaration at this point and mutual dependency problems.
+ friend class SPIRVGeneralDuplicatesTracker;
+};
+
+template <typename T>
+class SPIRVDuplicatesTracker : public SPIRVDuplicatesTrackerBase<const T *> {};
+
+class SPIRVGeneralDuplicatesTracker {
+ SPIRVDuplicatesTracker<Type> TT;
+ SPIRVDuplicatesTracker<Constant> CT;
+ SPIRVDuplicatesTracker<GlobalVariable> GT;
+ SPIRVDuplicatesTracker<Function> FT;
+ SPIRVDuplicatesTracker<Argument> AT;
+
+ // NOTE: using MOs instead of regs to get rid of the MF dependency and allow
+ // a flat data structure.
+ // NOTE: replacing DenseMap with MapVector doesn't affect overall correctness,
+ // but makes LITs more stable; DenseMap is still preferable due to the
+ // significant perf difference.
+ using SPIRVReg2EntryTy =
+ MapVector<MachineOperand *, SPIRV::DTSortableEntry *>;
+
+ template <typename T>
+ void prebuildReg2Entry(SPIRVDuplicatesTracker<T> &DT,
+ SPIRVReg2EntryTy &Reg2Entry);
+
+public:
+ void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph,
+ MachineModuleInfo *MMI);
+
+ void add(const Type *T, const MachineFunction *MF, Register R) {
+ TT.add(T, MF, R);
+ }
+
+ void add(const Constant *C, const MachineFunction *MF, Register R) {
+ CT.add(C, MF, R);
+ }
+
+ void add(const GlobalVariable *GV, const MachineFunction *MF, Register R) {
+ GT.add(GV, MF, R);
+ }
+
+ void add(const Function *F, const MachineFunction *MF, Register R) {
+ FT.add(F, MF, R);
+ }
+
+ void add(const Argument *Arg, const MachineFunction *MF, Register R) {
+ AT.add(Arg, MF, R);
+ }
+
+ Register find(const Type *T, const MachineFunction *MF) {
+ return TT.find(const_cast<Type *>(T), MF);
+ }
+
+ Register find(const Constant *C, const MachineFunction *MF) {
+ return CT.find(const_cast<Constant *>(C), MF);
+ }
+
+ Register find(const GlobalVariable *GV, const MachineFunction *MF) {
+ return GT.find(const_cast<GlobalVariable *>(GV), MF);
+ }
+
+ Register find(const Function *F, const MachineFunction *MF) {
+ return FT.find(const_cast<Function *>(F), MF);
+ }
+
+ Register find(const Argument *Arg, const MachineFunction *MF) {
+ return AT.find(const_cast<Argument *>(Arg), MF);
+ }
+};
+} // namespace llvm
+#endif
\ No newline at end of file
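
A hedged, standalone sketch of the add/find contract the tracker above defines: add() records a (key, machine function) -> register mapping only once and ignores later duplicates, while find() returns an invalid register when nothing was recorded. Simplified containers stand in for the real LLVM types; all names here are illustrative.

  #include <cassert>
  #include <map>

  using Register = unsigned;     // 0 plays the role of an invalid Register
  using Key = const void *;      // e.g. a Constant*, Type*, Function*
  using Func = const void *;     // e.g. a MachineFunction*

  // key -> (function -> register), first insertion wins.
  class DuplicatesTrackerSketch {
    std::map<Key, std::map<Func, Register>> Storage;

  public:
    Register find(Key V, Func MF) const {
      auto It = Storage.find(V);
      if (It == Storage.end())
        return 0;
      auto It2 = It->second.find(MF);
      return It2 == It->second.end() ? 0 : It2->second;
    }

    void add(Key V, Func MF, Register R) {
      if (find(V, MF)) // already tracked for this function: keep the first reg
        return;
      Storage[V][MF] = R;
    }
  };

  int main() {
    DuplicatesTrackerSketch DT;
    int SomeConstant = 0, SomeFunction = 0;
    DT.add(&SomeConstant, &SomeFunction, 5);
    DT.add(&SomeConstant, &SomeFunction, 7); // duplicate, ignored
    assert(DT.find(&SomeConstant, &SomeFunction) == 5);
    assert(DT.find(&SomeConstant, nullptr) == 0); // not tracked -> invalid
  }
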
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 02a6905a1abc..5f890c003cbc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -101,7 +101,6 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val,
SPIRVType *SpvType,
bool EmitIR) {
auto &MF = MIRBuilder.getMF();
- Register Res;
const IntegerType *LLVMIntTy;
if (SpvType)
LLVMIntTy = cast<IntegerType>(getTypeForSPIRVType(SpvType));
@@ -110,15 +109,18 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val,
// Find a constant in DT or build a new one.
const auto ConstInt =
ConstantInt::get(const_cast<IntegerType *>(LLVMIntTy), Val);
- unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
- Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
- assignTypeToVReg(LLVMIntTy, Res, MIRBuilder);
- if (EmitIR)
- MIRBuilder.buildConstant(Res, *ConstInt);
- else
- MIRBuilder.buildInstr(SPIRV::OpConstantI)
- .addDef(Res)
- .addImm(ConstInt->getSExtValue());
+ Register Res = DT.find(ConstInt, &MF);
+ if (!Res.isValid()) {
+ unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
+ Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
+ assignTypeToVReg(LLVMIntTy, Res, MIRBuilder);
+ if (EmitIR)
+ MIRBuilder.buildConstant(Res, *ConstInt);
+ else
+ MIRBuilder.buildInstr(SPIRV::OpConstantI)
+ .addDef(Res)
+ .addImm(ConstInt->getSExtValue());
+ }
return Res;
}
@@ -126,7 +128,6 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val,
MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType) {
auto &MF = MIRBuilder.getMF();
- Register Res;
const Type *LLVMFPTy;
if (SpvType) {
LLVMFPTy = getTypeForSPIRVType(SpvType);
@@ -136,10 +137,13 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val,
}
// Find a constant in DT or build a new one.
const auto ConstFP = ConstantFP::get(LLVMFPTy->getContext(), Val);
- unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
- Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
- assignTypeToVReg(LLVMFPTy, Res, MIRBuilder);
- MIRBuilder.buildFConstant(Res, *ConstFP);
+ Register Res = DT.find(ConstFP, &MF);
+ if (!Res.isValid()) {
+ unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
+ Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
+ assignTypeToVReg(LLVMFPTy, Res, MIRBuilder);
+ MIRBuilder.buildFConstant(Res, *ConstFP);
+ }
return Res;
}
@@ -184,6 +188,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
*Subtarget.getRegBankInfo());
}
Reg = MIB->getOperand(0).getReg();
+ DT.add(GVar, &MIRBuilder.getMF(), Reg);
// Set to Reg the same type as ResVReg has.
auto MRI = MIRBuilder.getMRI();
@@ -318,10 +323,11 @@ SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const {
SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(
const Type *Type, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AccessQual, bool EmitIR) {
+ Register Reg = DT.find(Type, &MIRBuilder.getMF());
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
SPIRVType *SpirvType = createSPIRVType(Type, MIRBuilder, AccessQual, EmitIR);
- VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType;
- SPIRVToLLVMType[SpirvType] = Type;
- return SpirvType;
+ return restOfCreateSPIRVType(Type, SpirvType);
}
bool SPIRVGlobalRegistry::isScalarOfType(Register VReg,
@@ -387,17 +393,21 @@ SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(unsigned BitWidth,
MIRBuilder);
}
-SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(Type *LLVMTy,
- MachineInstrBuilder MIB) {
- SPIRVType *SpirvType = MIB;
+SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(const Type *LLVMTy,
+ SPIRVType *SpirvType) {
+ assert(CurMF == SpirvType->getMF());
VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType;
SPIRVToLLVMType[SpirvType] = LLVMTy;
+ DT.add(LLVMTy, CurMF, getSPIRVTypeID(SpirvType));
return SpirvType;
}
SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(
unsigned BitWidth, MachineInstr &I, const SPIRVInstrInfo &TII) {
Type *LLVMTy = IntegerType::get(CurMF->getFunction().getContext(), BitWidth);
+ Register Reg = DT.find(LLVMTy, CurMF);
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
MachineBasicBlock &BB = *I.getParent();
auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeInt))
.addDef(createTypeVReg(CurMF->getRegInfo()))
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 952ab4c13e29..13dcc20a3e0a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -17,6 +17,7 @@
#define LLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H
#include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "SPIRVDuplicatesTracker.h"
#include "SPIRVInstrInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
@@ -30,7 +31,10 @@ class SPIRVGlobalRegistry {
// where Reg = OpType...
// while VRegToTypeMap tracks SPIR-V type assigned to other regs (i.e. not
// type-declaring ones)
- DenseMap<MachineFunction *, DenseMap<Register, SPIRVType *>> VRegToTypeMap;
+ DenseMap<const MachineFunction *, DenseMap<Register, SPIRVType *>>
+ VRegToTypeMap;
+
+ SPIRVGeneralDuplicatesTracker DT;
DenseMap<SPIRVType *, const Type *> SPIRVToLLVMType;
@@ -48,6 +52,39 @@ public:
MachineFunction *CurMF;
+ void add(const Constant *C, MachineFunction *MF, Register R) {
+ DT.add(C, MF, R);
+ }
+
+ void add(const GlobalVariable *GV, MachineFunction *MF, Register R) {
+ DT.add(GV, MF, R);
+ }
+
+ void add(const Function *F, MachineFunction *MF, Register R) {
+ DT.add(F, MF, R);
+ }
+
+ void add(const Argument *Arg, MachineFunction *MF, Register R) {
+ DT.add(Arg, MF, R);
+ }
+
+ Register find(const Constant *C, MachineFunction *MF) {
+ return DT.find(C, MF);
+ }
+
+ Register find(const GlobalVariable *GV, MachineFunction *MF) {
+ return DT.find(GV, MF);
+ }
+
+ Register find(const Function *F, MachineFunction *MF) {
+ return DT.find(F, MF);
+ }
+
+ void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph,
+ MachineModuleInfo *MMI = nullptr) {
+ DT.buildDepsGraph(Graph, MMI);
+ }
+
// Get or create a SPIR-V type corresponding the given LLVM IR type,
// and map it to the given VReg by creating an ASSIGN_TYPE instruction.
SPIRVType *assignTypeToVReg(
@@ -136,7 +173,7 @@ private:
SPIRVType *getOpTypeFunction(SPIRVType *RetType,
const SmallVectorImpl<SPIRVType *> &ArgTypes,
MachineIRBuilder &MIRBuilder);
- SPIRVType *restOfCreateSPIRVType(Type *LLVMTy, MachineInstrBuilder MIB);
+ SPIRVType *restOfCreateSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType);
public:
Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder,
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 9294a60506a8..90b921a06f21 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -807,23 +807,29 @@ void SPIRVInstructionSelector::renderImm32(MachineInstrBuilder &MIB,
Register
SPIRVInstructionSelector::buildI32Constant(uint32_t Val, MachineInstr &I,
const SPIRVType *ResType) const {
+ Type *LLVMTy = IntegerType::get(GR.CurMF->getFunction().getContext(), 32);
const SPIRVType *SpvI32Ty =
ResType ? ResType : GR.getOrCreateSPIRVIntegerType(32, I, TII);
- Register NewReg;
- NewReg = MRI->createGenericVirtualRegister(LLT::scalar(32));
- MachineInstr *MI;
- MachineBasicBlock &BB = *I.getParent();
- if (Val == 0) {
- MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
- .addDef(NewReg)
- .addUse(GR.getSPIRVTypeID(SpvI32Ty));
- } else {
- MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI))
- .addDef(NewReg)
- .addUse(GR.getSPIRVTypeID(SpvI32Ty))
- .addImm(APInt(32, Val).getZExtValue());
+ // Find a constant in DT or build a new one.
+ auto ConstInt = ConstantInt::get(LLVMTy, Val);
+ Register NewReg = GR.find(ConstInt, GR.CurMF);
+ if (!NewReg.isValid()) {
+ NewReg = MRI->createGenericVirtualRegister(LLT::scalar(32));
+ GR.add(ConstInt, GR.CurMF, NewReg);
+ MachineInstr *MI;
+ MachineBasicBlock &BB = *I.getParent();
+ if (Val == 0) {
+ MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
+ .addDef(NewReg)
+ .addUse(GR.getSPIRVTypeID(SpvI32Ty));
+ } else {
+ MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI))
+ .addDef(NewReg)
+ .addUse(GR.getSPIRVTypeID(SpvI32Ty))
+ .addImm(APInt(32, Val).getZExtValue());
+ }
+ constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
}
- constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
return NewReg;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index fa78dd7942c6..a39df5234935 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -28,6 +28,11 @@ using namespace llvm;
#define DEBUG_TYPE "spirv-module-analysis"
+static cl::opt<bool>
+ SPVDumpDeps("spv-dump-deps",
+ cl::desc("Dump MIR with SPIR-V dependencies info"),
+ cl::Optional, cl::init(false));
+
char llvm::SPIRVModuleAnalysis::ID = 0;
namespace llvm {
@@ -113,6 +118,83 @@ static bool findSameInstrInMS(const MachineInstr &A,
return false;
}
+// Collect MI which defines the register in the given machine function.
+static void collectDefInstr(Register Reg, const MachineFunction *MF,
+ SPIRV::ModuleAnalysisInfo *MAI,
+ SPIRV::ModuleSectionType MSType,
+ bool DoInsert = true) {
+ assert(MAI->hasRegisterAlias(MF, Reg) && "Cannot find register alias");
+ MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(Reg);
+ assert(MI && "There should be an instruction that defines the register");
+ MAI->setSkipEmission(MI);
+ if (DoInsert)
+ MAI->MS[MSType].push_back(MI);
+}
+
+void SPIRVModuleAnalysis::collectGlobalEntities(
+ const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
+ SPIRV::ModuleSectionType MSType,
+ std::function<bool(const SPIRV::DTSortableEntry *)> Pred,
+ bool UsePreOrder) {
+ DenseSet<const SPIRV::DTSortableEntry *> Visited;
+ for (const auto *E : DepsGraph) {
+ std::function<void(const SPIRV::DTSortableEntry *)> RecHoistUtil;
+ // NOTE: here we prefer the recursive approach over an iterative one because
+ // we don't expect dep chains long enough to cause a stack overflow.
+ RecHoistUtil = [MSType, UsePreOrder, &Visited, &Pred,
+ &RecHoistUtil](const SPIRV::DTSortableEntry *E) {
+ if (Visited.count(E) || !Pred(E))
+ return;
+ Visited.insert(E);
+
+ // Traversing the deps graph in post-order allows us to get rid of register
+ // alias preprocessing. But pre-order is required for correct processing of
+ // function declarations and their arguments.
+ if (!UsePreOrder)
+ for (auto *S : E->getDeps())
+ RecHoistUtil(S);
+
+ Register GlobalReg = Register::index2VirtReg(MAI.getNextID());
+ bool IsFirst = true;
+ for (auto &U : *E) {
+ const MachineFunction *MF = U.first;
+ Register Reg = U.second;
+ MAI.setRegisterAlias(MF, Reg, GlobalReg);
+ if (!MF->getRegInfo().getUniqueVRegDef(Reg))
+ continue;
+ collectDefInstr(Reg, MF, &MAI, MSType, IsFirst);
+ IsFirst = false;
+ if (E->getIsGV())
+ MAI.GlobalVarList.push_back(MF->getRegInfo().getUniqueVRegDef(Reg));
+ }
+
+ if (UsePreOrder)
+ for (auto *S : E->getDeps())
+ RecHoistUtil(S);
+ };
+ RecHoistUtil(E);
+ }
+}
+
+// The function initializes the global register alias table for types, consts,
+// global vars and func decls, and collects these instructions for output at
+// module level. It also collects explicit OpExtension/OpCapability
+// instructions.
+void SPIRVModuleAnalysis::processDefInstrs(const Module &M) {
+ std::vector<SPIRV::DTSortableEntry *> DepsGraph;
+
+ GR->buildDepsGraph(DepsGraph, SPVDumpDeps ? MMI : nullptr);
+
+ collectGlobalEntities(
+ DepsGraph, SPIRV::MB_TypeConstVars,
+ [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); }, false);
+
+ collectGlobalEntities(
+ DepsGraph, SPIRV::MB_ExtFuncDecls,
+ [](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true);
+}
+
// Look for IDs declared with Import linkage, and map the imported name string
// to the register defining that variable (which will usually be the result of
// an OpFunction). This lets us call externally imported functions using
@@ -146,10 +228,9 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI,
// numbering has already occurred by this point. We can directly compare reg
// arguments when detecting duplicates.
static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
- SPIRV::ModuleSectionType MSType,
- bool IsConstOrType = false) {
+ SPIRV::ModuleSectionType MSType) {
MAI.setSkipEmission(&MI);
- if (findSameInstrInMS(MI, MSType, MAI, IsConstOrType, IsConstOrType ? 1 : 0))
+ if (findSameInstrInMS(MI, MSType, MAI, false))
return; // Found a duplicate, so don't add it.
// No duplicates, so add it.
MAI.MS[MSType].push_back(&MI);
@@ -163,18 +244,11 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
continue;
MachineFunction *MF = MMI->getMachineFunction(*F);
assert(MF);
- unsigned FCounter = 0;
for (MachineBasicBlock &MBB : *MF)
for (MachineInstr &MI : MBB) {
- if (MI.getOpcode() == SPIRV::OpFunction)
- FCounter++;
if (MAI.getSkipEmission(&MI))
continue;
const unsigned OpCode = MI.getOpcode();
- const bool IsFuncOrParm =
- OpCode == SPIRV::OpFunction || OpCode == SPIRV::OpFunctionParameter;
- const bool IsConstOrType =
- TII->isConstantInstr(MI) || TII->isTypeDeclInstr(MI);
if (OpCode == SPIRV::OpName || OpCode == SPIRV::OpMemberName) {
collectOtherInstr(MI, MAI, SPIRV::MB_DebugNames);
} else if (OpCode == SPIRV::OpEntryPoint) {
@@ -182,12 +256,6 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
} else if (TII->isDecorationInstr(MI)) {
collectOtherInstr(MI, MAI, SPIRV::MB_Annotations);
collectFuncNames(MI, *F);
- } else if (IsConstOrType || (FCounter > 1 && IsFuncOrParm)) {
- // Now OpSpecConstant*s are not in DT,
- // but they need to be collected anyway.
- enum SPIRV::ModuleSectionType Type =
- IsFuncOrParm ? SPIRV::MB_ExtFuncDecls : SPIRV::MB_TypeConstVars;
- collectOtherInstr(MI, MAI, Type, IsConstOrType);
} else if (OpCode == SPIRV::OpFunction) {
collectFuncNames(MI, *F);
}
@@ -239,6 +307,7 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) {
// TODO: Process type/const/global var/func decl instructions, number their
// destination registers from 0 to N, collect Extensions and Capabilities.
+ processDefInstrs(M);
// Number rest of registers from N+1 onwards.
numberRegistersGlobally(M);
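
collectGlobalEntities above hoists module-level entities by walking the dependency graph either post-order (dependencies first, used for types, constants and globals) or pre-order (node first, used for function declarations so OpFunction precedes its OpFunctionParameters). A standalone sketch of those two visit orders over a toy adjacency list; the node type here is illustrative, not the real DTSortableEntry.

  #include <cstdio>
  #include <set>
  #include <vector>

  struct Node {
    const char *Name;
    std::vector<Node *> Deps;
  };

  // Visit every reachable node once. With UsePreOrder=false the dependencies
  // are emitted before their user (types/consts/globals); with
  // UsePreOrder=true the node is emitted first (a function before its params).
  static void visit(Node *N, bool UsePreOrder, std::set<Node *> &Visited) {
    if (!Visited.insert(N).second)
      return;
    if (!UsePreOrder)
      for (Node *D : N->Deps)
        visit(D, UsePreOrder, Visited);
    std::printf("%s\n", N->Name);
    if (UsePreOrder)
      for (Node *D : N->Deps)
        visit(D, UsePreOrder, Visited);
  }

  int main() {
    Node IntTy{"OpTypeInt", {}};
    Node Const{"OpConstantI", {&IntTy}};
    std::set<Node *> Visited;
    visit(&Const, /*UsePreOrder=*/false, Visited); // OpTypeInt, then OpConstantI
  }
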
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index 1bef13d458c1..585868909d28 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_SPIRV_SPIRVMODULEANALYSIS_H
#include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "SPIRVDuplicatesTracker.h"
#include "SPIRVSubtarget.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
@@ -123,6 +124,11 @@ public:
private:
void setBaseInfo(const Module &M);
template <typename T> void collectTypesConstsVars();
+ void collectGlobalEntities(
+ const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
+ SPIRV::ModuleSectionType MSType,
+ std::function<bool(const SPIRV::DTSortableEntry *)> Pred,
+ bool UsePreOrder);
void processDefInstrs(const Module &M);
void collectFuncNames(MachineInstr &MI, const Function &F);
void processOtherInstrs(const Module &M);
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index d75d41b35838..ee460002fc58 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -44,12 +44,11 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
class SparcMCCodeEmitter : public MCCodeEmitter {
- const MCInstrInfo &MCII;
MCContext &Ctx;
public:
- SparcMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
- : MCII(mcii), Ctx(ctx) {}
+ SparcMCCodeEmitter(const MCInstrInfo &, MCContext &ctx)
+ : Ctx(ctx) {}
SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete;
SparcMCCodeEmitter &operator=(const SparcMCCodeEmitter &) = delete;
~SparcMCCodeEmitter() override = default;
@@ -84,12 +83,6 @@ public:
unsigned getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
-
-private:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
@@ -97,9 +90,6 @@ private:
void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
unsigned Bits = getBinaryCodeForInstr(MI, Fixups, STI);
support::endian::write(OS, Bits,
Ctx.getAsmInfo()->isLittleEndian() ? support::little
@@ -253,7 +243,6 @@ getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
return 0;
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SparcGenMCCodeEmitter.inc"
MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 49b75b7e0bd1..b11c786e7856 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -24,6 +24,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SparcGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index 7ef043d9df40..8e6a9ebdb2dd 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -46,6 +46,7 @@ std::unique_ptr<MCObjectTargetWriter> createSparcELFObjectWriter(bool Is64Bit,
// Defines symbolic names for the Sparc instructions.
//
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "SparcGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index f6f9c0a1de81..c8961d507c72 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -250,6 +250,8 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
}
void SparcAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ Sparc_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
switch (MI->getOpcode()) {
default: break;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index 242f566da2c9..1a71ff28424f 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -150,23 +150,13 @@ private:
return getPCRelEncoding(MI, OpNum, Fixups,
SystemZ::FK_390_PC24DBL, 3, false);
}
-
-private:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
-void SystemZMCCodeEmitter::
-encodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
+void SystemZMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
MemOpsEmitted = 0;
uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
unsigned Size = MCII.get(MI.getOpcode()).getSize();
@@ -329,7 +319,6 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
return 0;
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SystemZGenMCCodeEmitter.inc"
MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 03141ecf551d..08886507fdb7 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -23,6 +23,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SystemZGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index db4485423416..f2bfc9ac48e5 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -95,6 +95,7 @@ std::unique_ptr<MCObjectTargetWriter> createSystemZObjectWriter(uint8_t OSABI);
// Defines symbolic names for the SystemZ instructions.
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "SystemZGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 6fb080607f51..1d55bf9a5804 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -143,6 +143,9 @@ void SystemZAsmPrinter::emitCallInformation(CallType CT) {
}
void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ SystemZ_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
SystemZMCInstLower Lower(MF->getContext(), *this);
MCInst LoweredMI;
switch (MI->getOpcode()) {
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index a7ea5e1e4bf8..fdd82a01f211 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -162,11 +162,7 @@ def CSR_SystemZ_NoRegs : CalleeSavedRegs<(add)>;
//===----------------------------------------------------------------------===//
// z/OS XPLINK64 callee-saved registers
//===----------------------------------------------------------------------===//
-// %R7D is volatile by the spec, but it must be saved in the prologue by
-// any non-leaf function and restored in the epilogue for use by the
-// return instruction so it functions exactly like a callee-saved register.
-def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 7, 15),
- (sequence "R%dD", 4, 4),
+def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 8, 15),
(sequence "F%dD", 15, 8))>;
def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add CSR_SystemZ_XPLINK64,
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 43bc7426cfa8..975eb8862e82 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -918,72 +918,74 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+ auto &GRRegClass = SystemZ::GR64BitRegClass;
+
+ // For non-leaf functions:
+ // - the address of callee (entry point) register R6 must be saved
+ CSI.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister()));
+ CSI.back().setRestored(false);
+
+ // The return address register R7 must be saved and restored.
+ CSI.push_back(CalleeSavedInfo(Regs.getReturnFunctionAddressRegister()));
+
+ // If the function needs a frame pointer, or if the backchain pointer should
+ // be stored, then save the stack pointer register R4.
+ if (hasFP(MF) || MF.getFunction().hasFnAttribute("backchain"))
+ CSI.push_back(CalleeSavedInfo(Regs.getStackPointerRegister()));
// Scan the call-saved GPRs and find the bounds of the register spill area.
- unsigned LowGPR = 0;
- int LowOffset = INT32_MAX;
- unsigned HighGPR = LowGPR;
+ Register LowRestoreGPR = 0;
+ int LowRestoreOffset = INT32_MAX;
+ Register LowSpillGPR = 0;
+ int LowSpillOffset = INT32_MAX;
+ Register HighGPR = 0;
int HighOffset = -1;
- unsigned RegSP = Regs.getStackPointerRegister();
- auto &GRRegClass = SystemZ::GR64BitRegClass;
- const unsigned RegSize = 8;
+ for (auto &CS : CSI) {
+ Register Reg = CS.getReg();
+ int Offset = RegSpillOffsets[Reg];
+ if (Offset >= 0) {
+ if (GRRegClass.contains(Reg)) {
+ if (LowSpillOffset > Offset) {
+ LowSpillOffset = Offset;
+ LowSpillGPR = Reg;
+ }
+ if (CS.isRestored() && LowRestoreOffset > Offset) {
+ LowRestoreOffset = Offset;
+ LowRestoreGPR = Reg;
+ }
- auto ProcessCSI = [&](std::vector<CalleeSavedInfo> &CSIList) {
- for (auto &CS : CSIList) {
- Register Reg = CS.getReg();
- int Offset = RegSpillOffsets[Reg];
- if (Offset >= 0) {
- if (GRRegClass.contains(Reg)) {
- if (LowOffset > Offset) {
- LowOffset = Offset;
- LowGPR = Reg;
- }
-
- if (Offset > HighOffset) {
- HighOffset = Offset;
- HighGPR = Reg;
- }
+ if (Offset > HighOffset) {
+ HighOffset = Offset;
+ HighGPR = Reg;
}
+ // Non-volatile GPRs are saved in the dedicated register save area at
+ // the bottom of the stack and are not truly part of the "normal" stack
+ // frame. Mark the frame index as NoAlloc to indicate it as such.
+ unsigned RegSize = 8;
int FrameIdx = MFFrame.CreateFixedSpillStackObject(RegSize, Offset);
CS.setFrameIdx(FrameIdx);
- } else
- CS.setFrameIdx(INT32_MAX);
+ MFFrame.setStackID(FrameIdx, TargetStackID::NoAlloc);
+ }
+ } else {
+ Register Reg = CS.getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ Align Alignment = TRI->getSpillAlign(*RC);
+ unsigned Size = TRI->getSpillSize(*RC);
+ Alignment = std::min(Alignment, getStackAlign());
+ int FrameIdx = MFFrame.CreateStackObject(Size, Alignment, true);
+ CS.setFrameIdx(FrameIdx);
}
- };
-
- std::vector<CalleeSavedInfo> Spills;
-
- // For non-leaf functions:
- // - the address of callee (entry point) register R6 must be saved
- Spills.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister()));
-
- // If the function needs a frame pointer, or if the backchain pointer should
- // be stored, then save the stack pointer register R4.
- if (hasFP(MF) || MF.getFunction().hasFnAttribute("backchain"))
- Spills.push_back(CalleeSavedInfo(RegSP));
+ }
// Save the range of call-saved registers, for use by the
// prologue/epilogue inserters.
- ProcessCSI(CSI);
- MFI->setRestoreGPRRegs(LowGPR, HighGPR, LowOffset);
+ if (LowRestoreGPR)
+ MFI->setRestoreGPRRegs(LowRestoreGPR, HighGPR, LowRestoreOffset);
// Save the range of call-saved registers, for use by the epilogue inserter.
- ProcessCSI(Spills);
- MFI->setSpillGPRRegs(LowGPR, HighGPR, LowOffset);
-
- // Create spill slots for the remaining registers.
- for (auto &CS : CSI) {
- if (CS.getFrameIdx() != INT32_MAX)
- continue;
- Register Reg = CS.getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- Align Alignment = TRI->getSpillAlign(*RC);
- unsigned Size = TRI->getSpillSize(*RC);
- Alignment = std::min(Alignment, getStackAlign());
- int FrameIdx = MFFrame.CreateStackObject(Size, Alignment, true);
- CS.setFrameIdx(FrameIdx);
- }
+ assert(LowSpillGPR && "Expected registers to spill");
+ MFI->setSpillGPRRegs(LowSpillGPR, HighGPR, LowSpillOffset);
return true;
}
@@ -1001,13 +1003,6 @@ void SystemZXPLINKFrameLowering::determineCalleeSaves(MachineFunction &MF,
// frame pointer will be clobbered.
if (HasFP)
SavedRegs.set(Regs.getFramePointerRegister());
-
- // If the function is not an XPLeaf function, we need to save the
- // return address register. We also always use that register for
- // the return instruction, so it needs to be restored in the
- // epilogue even though that register is considered to be volatile.
- // #TODO: Implement leaf detection.
- SavedRegs.set(Regs.getReturnFunctionAddressRegister());
}
bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters(
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 8f633adbb9ef..29cc2840310d 100644
--- a/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -240,6 +240,13 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
return SectionKind::getBSS();
}
+ // Global variables with '!exclude' should get the exclude section kind if
+ // they have an explicit section and no other metadata.
+ if (GVar->hasSection())
+ if (MDNode *MD = GVar->getMetadata(LLVMContext::MD_exclude))
+ if (!MD->getNumOperands())
+ return SectionKind::getExclude();
+
// If the global is marked constant, we can put it into a mergable section,
// a mergable string section, or general .data if it contains relocations.
if (GVar->isConstant()) {
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
index 3eb246f73679..45facd34f84e 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
@@ -39,12 +39,11 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
class VEMCCodeEmitter : public MCCodeEmitter {
- const MCInstrInfo &MCII;
MCContext &Ctx;
public:
- VEMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
- : MCII(mcii), Ctx(ctx) {}
+ VEMCCodeEmitter(const MCInstrInfo &, MCContext &ctx)
+ : Ctx(ctx) {}
VEMCCodeEmitter(const VEMCCodeEmitter &) = delete;
VEMCCodeEmitter &operator=(const VEMCCodeEmitter &) = delete;
~VEMCCodeEmitter() override = default;
@@ -74,12 +73,6 @@ public:
uint64_t getRDOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
-
-private:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // end anonymous namespace
@@ -87,9 +80,6 @@ private:
void VEMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
support::endian::write<uint64_t>(OS, Bits, support::little);
@@ -155,7 +145,6 @@ uint64_t VEMCCodeEmitter::getRDOpValue(const MCInst &MI, unsigned OpNo,
return 0;
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "VEGenMCCodeEmitter.inc"
MCCodeEmitter *llvm::createVEMCCodeEmitter(const MCInstrInfo &MCII,
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
index f4fbf763e59c..5a562d77f941 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
@@ -24,6 +24,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "VEGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
index d8f9d0634c24..935a0bfc0c4c 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
@@ -44,6 +44,7 @@ std::unique_ptr<MCObjectTargetWriter> createVEELFObjectWriter(uint8_t OSABI);
// Defines symbolic names for the VE instructions.
//
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "VEGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index af69d04a17ca..5553087d6f47 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -325,6 +325,8 @@ void VEAsmPrinter::lowerGETTLSAddrAndEmitMCInsts(const MachineInstr *MI,
}
void VEAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ VE_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
switch (MI->getOpcode()) {
default:
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index 85285749b4fa..e54453b31354 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -325,22 +325,22 @@ def VEMEMziiAsmOperand : AsmOperandClass {
// ASX format uses single assembly instruction format.
def MEMrri : Operand<iPTR> {
let PrintMethod = "printMemASXOperand";
- let MIOperandInfo = (ops ptr_rc, ptr_rc, i32imm);
+ let MIOperandInfo = (ops ptr_rc, ptr_rc, i64imm);
let ParserMatchClass = VEMEMrriAsmOperand;
}
def MEMrii : Operand<iPTR> {
let PrintMethod = "printMemASXOperand";
- let MIOperandInfo = (ops ptr_rc, i32imm, i32imm);
+ let MIOperandInfo = (ops ptr_rc, i32imm, i64imm);
let ParserMatchClass = VEMEMriiAsmOperand;
}
def MEMzri : Operand<iPTR> {
let PrintMethod = "printMemASXOperand";
- let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i32imm);
+ let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i64imm);
let ParserMatchClass = VEMEMzriAsmOperand;
}
def MEMzii : Operand<iPTR> {
let PrintMethod = "printMemASXOperand";
- let MIOperandInfo = (ops i32imm /* = 0 */, i32imm, i32imm);
+ let MIOperandInfo = (ops i32imm /* = 0 */, i32imm, i64imm);
let ParserMatchClass = VEMEMziiAsmOperand;
}
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
index d175ad26c742..f334af128162 100644
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -27,6 +27,8 @@
using namespace llvm;
+#define DEBUG_TYPE "ve-register-info"
+
#define GET_REGINFO_TARGET_DESC
#include "VEGenRegisterInfo.inc"
@@ -133,66 +135,179 @@ static unsigned offsetToDisp(MachineInstr &MI) {
return OffDisp;
}
-static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II,
- MachineInstr &MI, const DebugLoc &dl,
- unsigned FIOperandNum, int Offset, Register FrameReg) {
- // Replace frame index with a frame pointer reference directly.
- // VE has 32 bit offset field, so no need to expand a target instruction.
- // Directly encode it.
+class EliminateFrameIndex {
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const DebugLoc &DL;
+ MachineBasicBlock &MBB;
+ MachineBasicBlock::iterator II;
+ Register clobber;
+
+ // Some helper functions for ease of instruction building.
+ MachineFunction &getFunc() const { return *MBB.getParent(); }
+ inline MCRegister getSubReg(MCRegister Reg, unsigned Idx) const {
+ return TRI.getSubReg(Reg, Idx);
+ }
+ inline const MCInstrDesc &get(unsigned Opcode) const {
+ return TII.get(Opcode);
+ }
+ inline MachineInstrBuilder build(const MCInstrDesc &MCID, Register DestReg) {
+ return BuildMI(MBB, II, DL, MCID, DestReg);
+ }
+ inline MachineInstrBuilder build(unsigned InstOpc, Register DestReg) {
+ return build(get(InstOpc), DestReg);
+ }
+ inline MachineInstrBuilder build(const MCInstrDesc &MCID) {
+ return BuildMI(MBB, II, DL, MCID);
+ }
+ inline MachineInstrBuilder build(unsigned InstOpc) {
+ return build(get(InstOpc));
+ }
+
+ // Calculate the address of a frame index from a frame register and a given
+ // offset if the offset doesn't fit in the immediate field. Use a clobber
+ // register to hold the calculated address.
+ void prepareReplaceFI(MachineInstr &MI, Register &FrameReg, int64_t &Offset,
+ int64_t Bytes = 0);
+ // Replace the frame index in \p MI with a frame register and a given offset
+ // if it fits in the immediate field. Otherwise, use the pre-calculated
+ // address in a clobber register.
+ void replaceFI(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+
+ // Expand and eliminate Frame Index of pseudo STQrii and LDQrii.
+ void processSTQ(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+ void processLDQ(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+
+public:
+ EliminateFrameIndex(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
+ const DebugLoc &DL, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II)
+ : TII(TII), TRI(TRI), DL(DL), MBB(MBB), II(II), clobber(VE::SX13) {}
+
+ // Expand and eliminate Frame Index from MI
+ void processMI(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+};
+
+// Prepare the frame index if it doesn't fit in the immediate field. Use a
+// clobber register to hold the calculated address.
+void EliminateFrameIndex::prepareReplaceFI(MachineInstr &MI, Register &FrameReg,
+ int64_t &Offset, int64_t Bytes) {
+ if (isInt<32>(Offset) && isInt<32>(Offset + Bytes)) {
+ // If the offset is small enough to fit in the immediate field, directly
+ // encode it. So, nothing to prepare here.
+ return;
+ }
+
+ // If the offset doesn't fit, emit the following code sequence. This
+ // clobbers SX13, which we always know is available here.
+ // lea %clobber, Offset@lo
+ // and %clobber, %clobber, (32)0
+ // lea.sl %clobber, Offset@hi(FrameReg, %clobber)
+ build(VE::LEAzii, clobber).addImm(0).addImm(0).addImm(Lo_32(Offset));
+ build(VE::ANDrm, clobber).addReg(clobber).addImm(M0(32));
+ build(VE::LEASLrri, clobber)
+ .addReg(clobber)
+ .addReg(FrameReg)
+ .addImm(Hi_32(Offset));
+
+ // Use the clobber register as the frame register with a 0 offset.
+ FrameReg = clobber;
+ Offset = 0;
+}
+
+// Replace the frame index in \p MI with a proper byte and framereg offset.
+void EliminateFrameIndex::replaceFI(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(isInt<32>(Offset));
+
+ // The offset must be small enough to fit in the immediate field after
+ // the call to prepareReplaceFI. Therefore, we directly encode it.
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
MI.getOperand(FIOperandNum + offsetToDisp(MI)).ChangeToImmediate(Offset);
}
+void EliminateFrameIndex::processSTQ(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::STQrii);
+ LLVM_DEBUG(dbgs() << "processSTQ: "; MI.dump());
+
+ prepareReplaceFI(MI, FrameReg, Offset, 8);
+
+ Register SrcReg = MI.getOperand(3).getReg();
+ Register SrcHiReg = getSubReg(SrcReg, VE::sub_even);
+ Register SrcLoReg = getSubReg(SrcReg, VE::sub_odd);
+ // VE stores HiReg to 8(addr) and LoReg to 0(addr)
+ MachineInstr *StMI =
+ build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(SrcLoReg);
+ replaceFI(*StMI, FrameReg, Offset, 0);
+ // Mutate to 'hi' store.
+ MI.setDesc(get(VE::STrii));
+ MI.getOperand(3).setReg(SrcHiReg);
+ Offset += 8;
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processLDQ(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::LDQrii);
+ LLVM_DEBUG(dbgs() << "processLDQ: "; MI.dump());
+
+ prepareReplaceFI(MI, FrameReg, Offset, 8);
+
+ Register DestReg = MI.getOperand(0).getReg();
+ Register DestHiReg = getSubReg(DestReg, VE::sub_even);
+ Register DestLoReg = getSubReg(DestReg, VE::sub_odd);
+ // VE loads HiReg from 8(addr) and LoReg from 0(addr)
+ MachineInstr *StMI =
+ build(VE::LDrii, DestLoReg).addReg(FrameReg).addImm(0).addImm(0);
+ replaceFI(*StMI, FrameReg, Offset, 1);
+ MI.setDesc(get(VE::LDrii));
+ MI.getOperand(0).setReg(DestHiReg);
+ Offset += 8;
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ switch (MI.getOpcode()) {
+ case VE::STQrii:
+ processSTQ(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ case VE::LDQrii:
+ processLDQ(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ }
+ prepareReplaceFI(MI, FrameReg, Offset);
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
void VERegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
MachineInstr &MI = *II;
- DebugLoc dl = MI.getDebugLoc();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
MachineFunction &MF = *MI.getParent()->getParent();
- const VEFrameLowering *TFI = getFrameLowering(MF);
+ const VESubtarget &Subtarget = MF.getSubtarget<VESubtarget>();
+ const VEFrameLowering &TFI = *getFrameLowering(MF);
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ const VERegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ EliminateFrameIndex EFI(TII, TRI, DL, *MI.getParent(), II);
+ // Retrieve FrameReg and byte offset for stack slot.
Register FrameReg;
- int Offset;
- Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed();
-
+ int64_t Offset =
+ TFI.getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed();
Offset += MI.getOperand(FIOperandNum + offsetToDisp(MI)).getImm();
- if (MI.getOpcode() == VE::STQrii) {
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- Register SrcReg = MI.getOperand(3).getReg();
- Register SrcHiReg = getSubReg(SrcReg, VE::sub_even);
- Register SrcLoReg = getSubReg(SrcReg, VE::sub_odd);
- // VE stores HiReg to 8(addr) and LoReg to 0(addr)
- MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(VE::STrii))
- .addReg(FrameReg)
- .addImm(0)
- .addImm(0)
- .addReg(SrcLoReg);
- replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg);
- MI.setDesc(TII.get(VE::STrii));
- MI.getOperand(3).setReg(SrcHiReg);
- Offset += 8;
- } else if (MI.getOpcode() == VE::LDQrii) {
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- Register DestReg = MI.getOperand(0).getReg();
- Register DestHiReg = getSubReg(DestReg, VE::sub_even);
- Register DestLoReg = getSubReg(DestReg, VE::sub_odd);
- // VE loads HiReg from 8(addr) and LoReg from 0(addr)
- MachineInstr *StMI =
- BuildMI(*MI.getParent(), II, dl, TII.get(VE::LDrii), DestLoReg)
- .addReg(FrameReg)
- .addImm(0)
- .addImm(0);
- replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg);
- MI.setDesc(TII.get(VE::LDrii));
- MI.getOperand(0).setReg(DestHiReg);
- Offset += 8;
- }
-
- replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg);
+ EFI.processMI(MI, FrameReg, Offset, FIOperandNum);
}
Register VERegisterInfo::getFrameRegister(const MachineFunction &MF) const {
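The prepareReplaceFI() change above hinges on one check: a frame-index offset is encoded directly only while it (plus the access size) still fits in VE's signed 32-bit displacement field; otherwise the 64-bit offset is split into lo/hi halves and materialized into SX13 with lea / and / lea.sl. A minimal standalone sketch of that decision using the same MathExtras helpers (the struct and function names here are illustrative, not from the patch):

  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  struct FIAddress {
    bool NeedsScratch;   // true -> emit lea/and/lea.sl into the clobber reg
    uint32_t Lo;         // Offset@lo for LEAzii
    uint32_t Hi;         // Offset@hi for LEASLrri
  };

  // Mirrors the isInt<32>(Offset) && isInt<32>(Offset + Bytes) test above.
  static FIAddress classifyFrameOffset(int64_t Offset, int64_t Bytes) {
    using namespace llvm;
    if (isInt<32>(Offset) && isInt<32>(Offset + Bytes))
      return {false, 0, 0};            // fits: encode the offset directly
    return {true, Lo_32(Offset), Hi_32(Offset)};
  }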
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
index 330eef4c7c2b..f88f298bc603 100644
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -41,7 +41,7 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
auto VVPOpcodeOpt = getVVPOpcode(Opcode);
if (!VVPOpcodeOpt)
return SDValue();
- unsigned VVPOpcode = VVPOpcodeOpt.getValue();
+ unsigned VVPOpcode = VVPOpcodeOpt.value();
const bool FromVP = ISD::isVPOpcode(Opcode);
// The representative and legalized vector type of this operation.
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
index ec72c1de0503..d31715e367ec 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp
@@ -87,15 +87,14 @@ bool WebAssemblyAsmTypeCheck::popType(SMLoc ErrorLoc,
if (Stack.empty()) {
return typeError(ErrorLoc,
EVT ? StringRef("empty stack while popping ") +
- WebAssembly::typeToString(EVT.getValue())
+ WebAssembly::typeToString(EVT.value())
: StringRef("empty stack while popping value"));
}
auto PVT = Stack.pop_back_val();
- if (EVT && EVT.getValue() != PVT) {
+ if (EVT && EVT.value() != PVT) {
return typeError(
ErrorLoc, StringRef("popped ") + WebAssembly::typeToString(PVT) +
- ", expected " +
- WebAssembly::typeToString(EVT.getValue()));
+ ", expected " + WebAssembly::typeToString(EVT.value()));
}
return false;
}
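The getValue() -> value() renames in this and several later hunks track llvm::Optional converging on the std::optional spelling; behaviour is unchanged. The same idiom in standard C++ terms (a sketch, not code from the patch):

  #include <optional>

  int valueOrZero(std::optional<int> EVT) {
    // llvm::Optional::getValue() was the older spelling of value().
    return EVT ? EVT.value() : 0;
  }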
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index f52545a65dbb..97dbc35c991b 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -26,6 +26,7 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-mc-target-desc"
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "WebAssemblyGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
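Defining ENABLE_INSTR_PREDICATE_VERIFIER before including the generated *GenInstrInfo.inc compiles in a <Target>_MC::verifyInstructionPredicates() helper that, in asserts builds, checks an opcode's required subtarget features against the features actually enabled; without the define it is a no-op. The AsmPrinter hunk later in this diff calls it as follows (shown here only as a sketch of the call shape):

  // In the target's emitInstruction(), before lowering the MachineInstr:
  WebAssembly_MC::verifyInstructionPredicates(MI->getOpcode(),
                                              Subtarget->getFeatureBits());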
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 75d5d0675990..b5b12200505b 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -124,6 +124,7 @@ enum TOF {
// Defines symbolic names for the WebAssembly instructions.
//
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "WebAssemblyGenInstrInfo.inc"
namespace llvm {
diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
index e3daf6bfa72e..ef2c77ade8cc 100644
--- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -37,4 +37,5 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetInfo() {
// which have to be in a shared location between CodeGen and MC.
#define GET_INSTRMAP_INFO 1
#define GET_INSTRINFO_ENUM 1
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "WebAssemblyGenInstrInfo.inc"
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
index 0f1655718481..f380b2582c65 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -13,6 +13,7 @@
#include "WebAssemblyTypeUtilities.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
// Get register classes enum.
#define GET_REGINFO_ENUM
@@ -168,6 +169,11 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) {
}
}
+wasm::ValType WebAssembly::regClassToValType(const TargetRegisterClass *RC) {
+ assert(RC != nullptr);
+ return regClassToValType(RC->getID());
+}
+
void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
const SmallVector<MVT, 1> &VTs) {
assert(!Sym->getType());
@@ -175,33 +181,28 @@ void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
// Tables are represented as Arrays in LLVM IR therefore
// they reach this point as aggregate Array types with an element type
// that is a reference type.
- wasm::ValType Type;
+ wasm::ValType ValTy;
bool IsTable = false;
if (GlobalVT->isArrayTy() &&
WebAssembly::isRefType(GlobalVT->getArrayElementType())) {
- MVT VT;
IsTable = true;
- switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) {
- case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF:
- VT = MVT::funcref;
- break;
- case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF:
- VT = MVT::externref;
- break;
- default:
- report_fatal_error("unhandled address space type");
- }
- Type = WebAssembly::toValType(VT);
+ const Type *ElTy = GlobalVT->getArrayElementType();
+ if (WebAssembly::isExternrefType(ElTy))
+ ValTy = wasm::ValType::EXTERNREF;
+ else if (WebAssembly::isFuncrefType(ElTy))
+ ValTy = wasm::ValType::FUNCREF;
+ else
+ report_fatal_error("unhandled reference type");
} else if (VTs.size() == 1) {
- Type = WebAssembly::toValType(VTs[0]);
+ ValTy = WebAssembly::toValType(VTs[0]);
} else
report_fatal_error("Aggregate globals not yet implemented");
if (IsTable) {
Sym->setType(wasm::WASM_SYMBOL_TYPE_TABLE);
- Sym->setTableType(Type);
+ Sym->setTableType(ValTy);
} else {
Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
- Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true});
+ Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(ValTy), /*Mutable=*/true});
}
}
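The new TargetRegisterClass overload lets callers map a virtual register straight to its wasm type without spelling out the register-class ID; the MCInstLower hunk further down uses it exactly this way. A condensed sketch of a call site (variable names illustrative):

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  wasm::ValType Ty =
      WebAssembly::regClassToValType(MRI.getRegClass(MO.getReg()));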
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
index 8fc67d37925c..86211700c70a 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
@@ -22,6 +22,9 @@
#include "llvm/Support/MachineValueType.h"
namespace llvm {
+
+class TargetRegisterClass;
+
namespace WebAssembly {
/// Used as immediate MachineOperands for block signatures
@@ -108,9 +111,12 @@ std::string signatureToString(const wasm::WasmSignature *Sig);
// Convert a MVT into its corresponding wasm ValType.
wasm::ValType toValType(MVT Type);
-// Convert a register class to a wasm ValType.
+// Convert a register class ID to a wasm ValType.
wasm::ValType regClassToValType(unsigned RC);
+// Convert a register class to a wasm ValType.
+wasm::ValType regClassToValType(const TargetRegisterClass *RC);
+
/// Sets a Wasm Symbol Type.
void wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
const SmallVector<MVT, 1> &VTs);
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp
index b87c884c9e4a..277bbee83a6f 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp
@@ -179,3 +179,25 @@ MachineInstr *WebAssembly::findCatch(MachineBasicBlock *EHPad) {
return &*Pos;
return nullptr;
}
+
+unsigned WebAssembly::getCopyOpcodeForRegClass(const TargetRegisterClass *RC) {
+ assert(RC != nullptr);
+ switch (RC->getID()) {
+ case WebAssembly::I32RegClassID:
+ return WebAssembly::COPY_I32;
+ case WebAssembly::I64RegClassID:
+ return WebAssembly::COPY_I64;
+ case WebAssembly::F32RegClassID:
+ return WebAssembly::COPY_F32;
+ case WebAssembly::F64RegClassID:
+ return WebAssembly::COPY_F64;
+ case WebAssembly::V128RegClassID:
+ return WebAssembly::COPY_V128;
+ case WebAssembly::FUNCREFRegClassID:
+ return WebAssembly::COPY_FUNCREF;
+ case WebAssembly::EXTERNREFRegClassID:
+ return WebAssembly::COPY_EXTERNREF;
+ default:
+ llvm_unreachable("Unexpected register class");
+ }
+}
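This helper centralizes the register-class-to-COPY_* mapping that was previously duplicated in CFGStackify, InstrInfo, and Peephole (all updated below). Typical use, matching those call sites (sketch only):

  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  unsigned CopyOpc = WebAssembly::getCopyOpcodeForRegClass(RC);
  BuildMI(MBB, InsertPt, DL, TII.get(CopyOpc), NewReg).addReg(Reg);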
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
index cdfc758db7ac..d0639208fda9 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
@@ -24,6 +24,7 @@ class MachineInstr;
class MachineOperand;
class MCContext;
class MCSymbolWasm;
+class TargetRegisterClass;
class WebAssemblyFunctionInfo;
class WebAssemblySubtarget;
@@ -65,6 +66,9 @@ getOrCreateFuncrefCallTableSymbol(MCContext &Ctx,
/// instruction found or the catch is in an invalid location.
MachineInstr *findCatch(MachineBasicBlock *EHPad);
+/// Returns the appropriate copy opcode for the given register class.
+unsigned getCopyOpcodeForRegClass(const TargetRegisterClass *RC);
+
} // end namespace WebAssembly
} // end namespace llvm
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 57d51634e849..bcb6cf1b4e1d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -597,6 +597,8 @@ void WebAssemblyAsmPrinter::emitFunctionBodyStart() {
void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) {
LLVM_DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
+ WebAssembly_MC::verifyInstructionPredicates(MI->getOpcode(),
+ Subtarget->getFeatureBits());
switch (MI->getOpcode()) {
case WebAssembly::ARGUMENT_i32:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 02e873a0f9a6..d2eb4b29e9fd 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -781,25 +781,6 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
}
}
-// Get the appropriate copy opcode for the given register class.
-static unsigned getCopyOpcode(const TargetRegisterClass *RC) {
- if (RC == &WebAssembly::I32RegClass)
- return WebAssembly::COPY_I32;
- if (RC == &WebAssembly::I64RegClass)
- return WebAssembly::COPY_I64;
- if (RC == &WebAssembly::F32RegClass)
- return WebAssembly::COPY_F32;
- if (RC == &WebAssembly::F64RegClass)
- return WebAssembly::COPY_F64;
- if (RC == &WebAssembly::V128RegClass)
- return WebAssembly::COPY_V128;
- if (RC == &WebAssembly::FUNCREFRegClass)
- return WebAssembly::COPY_FUNCREF;
- if (RC == &WebAssembly::EXTERNREFRegClass)
- return WebAssembly::COPY_EXTERNREF;
- llvm_unreachable("Unexpected register class");
-}
-
// When MBB is split into MBB and Split, we should unstackify defs in MBB that
// have their uses in Split.
static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
@@ -851,7 +832,8 @@ static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
if (!MFI.isVRegStackified(TeeReg)) {
// Now we are not using TEE anymore, so unstackify DefReg too
MFI.unstackifyVReg(DefReg);
- unsigned CopyOpc = getCopyOpcode(MRI.getRegClass(DefReg));
+ unsigned CopyOpc =
+ WebAssembly::getCopyOpcodeForRegClass(MRI.getRegClass(DefReg));
BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(CopyOpc), TeeReg)
.addReg(DefReg);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(CopyOpc), Reg).addReg(DefReg);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 5484c0db7775..9316826e3d92 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -66,23 +66,7 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
? MRI.getRegClass(DestReg)
: MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg);
- unsigned CopyOpcode;
- if (RC == &WebAssembly::I32RegClass)
- CopyOpcode = WebAssembly::COPY_I32;
- else if (RC == &WebAssembly::I64RegClass)
- CopyOpcode = WebAssembly::COPY_I64;
- else if (RC == &WebAssembly::F32RegClass)
- CopyOpcode = WebAssembly::COPY_F32;
- else if (RC == &WebAssembly::F64RegClass)
- CopyOpcode = WebAssembly::COPY_F64;
- else if (RC == &WebAssembly::V128RegClass)
- CopyOpcode = WebAssembly::COPY_V128;
- else if (RC == &WebAssembly::FUNCREFRegClass)
- CopyOpcode = WebAssembly::COPY_FUNCREF;
- else if (RC == &WebAssembly::EXTERNREFRegClass)
- CopyOpcode = WebAssembly::COPY_EXTERNREF;
- else
- llvm_unreachable("Unexpected register class");
+ unsigned CopyOpcode = WebAssembly::getCopyOpcodeForRegClass(RC);
BuildMI(MBB, I, DL, get(CopyOpcode), DestReg)
.addReg(SrcReg, KillSrc ? RegState::Kill : 0);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 2db4bd822349..7a1a769c6b16 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -553,7 +553,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) {
std::tie(SizeArg, NEltArg) = FnAttrs.getAllocSizeArgs();
SizeArg += 1;
if (NEltArg)
- NEltArg = NEltArg.getValue() + 1;
+ NEltArg = NEltArg.value() + 1;
FnAttrs.addAllocSizeAttr(SizeArg, NEltArg);
}
// In case the callee has 'noreturn' attribute, We need to remove it, because
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 2e6027a5605c..e8b3542df12f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -154,25 +154,6 @@ MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand(
return MCOperand::createExpr(Expr);
}
-// Return the WebAssembly type associated with the given register class.
-static wasm::ValType getType(const TargetRegisterClass *RC) {
- if (RC == &WebAssembly::I32RegClass)
- return wasm::ValType::I32;
- if (RC == &WebAssembly::I64RegClass)
- return wasm::ValType::I64;
- if (RC == &WebAssembly::F32RegClass)
- return wasm::ValType::F32;
- if (RC == &WebAssembly::F64RegClass)
- return wasm::ValType::F64;
- if (RC == &WebAssembly::V128RegClass)
- return wasm::ValType::V128;
- if (RC == &WebAssembly::EXTERNREFRegClass)
- return wasm::ValType::EXTERNREF;
- if (RC == &WebAssembly::FUNCREFRegClass)
- return wasm::ValType::FUNCREF;
- llvm_unreachable("Unexpected register class");
-}
-
static void getFunctionReturns(const MachineInstr *MI,
SmallVectorImpl<wasm::ValType> &Returns) {
const Function &F = MI->getMF()->getFunction();
@@ -221,10 +202,12 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
const MachineRegisterInfo &MRI =
MI->getParent()->getParent()->getRegInfo();
for (const MachineOperand &MO : MI->defs())
- Returns.push_back(getType(MRI.getRegClass(MO.getReg())));
+ Returns.push_back(
+ WebAssembly::regClassToValType(MRI.getRegClass(MO.getReg())));
for (const MachineOperand &MO : MI->explicit_uses())
if (MO.isReg())
- Params.push_back(getType(MRI.getRegClass(MO.getReg())));
+ Params.push_back(
+ WebAssembly::regClassToValType(MRI.getRegClass(MO.getReg())));
// call_indirect instructions have a callee operand at the end which
// doesn't count as a param.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index ba1c4b7233f2..5fcee7af9bde 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "Utils/WebAssemblyUtilities.h"
#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
@@ -95,31 +96,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
if (!MFI.isVRegStackified(Reg)) {
unsigned CopyLocalOpc;
const TargetRegisterClass *RegClass = MRI.getRegClass(Reg);
- switch (RegClass->getID()) {
- case WebAssembly::I32RegClassID:
- CopyLocalOpc = WebAssembly::COPY_I32;
- break;
- case WebAssembly::I64RegClassID:
- CopyLocalOpc = WebAssembly::COPY_I64;
- break;
- case WebAssembly::F32RegClassID:
- CopyLocalOpc = WebAssembly::COPY_F32;
- break;
- case WebAssembly::F64RegClassID:
- CopyLocalOpc = WebAssembly::COPY_F64;
- break;
- case WebAssembly::V128RegClassID:
- CopyLocalOpc = WebAssembly::COPY_V128;
- break;
- case WebAssembly::FUNCREFRegClassID:
- CopyLocalOpc = WebAssembly::COPY_FUNCREF;
- break;
- case WebAssembly::EXTERNREFRegClassID:
- CopyLocalOpc = WebAssembly::COPY_EXTERNREF;
- break;
- default:
- llvm_unreachable("Unexpected register class for return operand");
- }
+ CopyLocalOpc = WebAssembly::getCopyOpcodeForRegClass(RegClass);
Register NewReg = MRI.createVirtualRegister(RegClass);
BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
.addReg(Reg);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 388c0f9110b7..0b3e534315d5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -21,7 +21,6 @@
#include "WebAssemblyRuntimeLibcallSignatures.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
-#include "llvm/Support/ManagedStatic.h"
using namespace llvm;
@@ -482,10 +481,13 @@ struct RuntimeLibcallSignatureTable {
}
};
-ManagedStatic<RuntimeLibcallSignatureTable> RuntimeLibcallSignatures;
+RuntimeLibcallSignatureTable &getRuntimeLibcallSignatures() {
+ static RuntimeLibcallSignatureTable RuntimeLibcallSignatures;
+ return RuntimeLibcallSignatures;
+}
// Maps libcall names to their RTLIB::Libcall number. Builds the map in a
-// constructor for use with ManagedStatic
+// constructor for use with a static variable
struct StaticLibcallNameMap {
StringMap<RTLIB::Libcall> Map;
StaticLibcallNameMap() {
@@ -496,7 +498,8 @@ struct StaticLibcallNameMap {
};
for (const auto &NameLibcall : NameLibcalls) {
if (NameLibcall.first != nullptr &&
- RuntimeLibcallSignatures->Table[NameLibcall.second] != unsupported) {
+ getRuntimeLibcallSignatures().Table[NameLibcall.second] !=
+ unsupported) {
assert(Map.find(NameLibcall.first) == Map.end() &&
"duplicate libcall names in name map");
Map[NameLibcall.first] = NameLibcall.second;
@@ -523,7 +526,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
wasm::ValType PtrTy =
Subtarget.hasAddr64() ? wasm::ValType::I64 : wasm::ValType::I32;
- auto &Table = RuntimeLibcallSignatures->Table;
+ auto &Table = getRuntimeLibcallSignatures().Table;
switch (Table[LC]) {
case func:
break;
@@ -885,14 +888,14 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
}
}
-static ManagedStatic<StaticLibcallNameMap> LibcallNameMap;
// TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unused
// other than here, just roll its logic into this version.
void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
StringRef Name,
SmallVectorImpl<wasm::ValType> &Rets,
SmallVectorImpl<wasm::ValType> &Params) {
- auto &Map = LibcallNameMap->Map;
+ static StaticLibcallNameMap LibcallNameMap;
+ auto &Map = LibcallNameMap.Map;
auto Val = Map.find(Name);
#ifndef NDEBUG
if (Val == Map.end()) {
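The ManagedStatic removals in this file (and in the X86 table files below) all follow the same pattern: a function-local static, which C++11 guarantees is initialized lazily and thread-safely on first use, replaces the ManagedStatic wrapper. A self-contained sketch of the idiom:

  #include <vector>

  struct ExampleTable {
    std::vector<int> Table;
    ExampleTable() : Table{1, 2, 3} {}   // built once, on first use
  };

  static ExampleTable &getExampleTable() {
    static ExampleTable T;   // construction is guarded by the compiler
    return T;
  }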
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index a903c5f455a2..da90befb2320 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -622,7 +622,7 @@ static bool printFMAComments(const MCInst *MI, raw_ostream &OS,
OS << '-';
OS << '(' << Mul1Name << " * " << Mul2Name << ") " << AccStr << ' '
- << AccName;
+ << AccName << '\n';
return true;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp
index 901082ce6cf3..640efd468135 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp
@@ -13,6 +13,7 @@
#include "X86InstrRelaxTables.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include <atomic>
using namespace llvm;
@@ -119,7 +120,7 @@ const X86InstrRelaxTableEntry *llvm::lookupRelaxTable(unsigned ShortOp) {
namespace {
// This class stores the short form tables. It is instantiated as a
-// ManagedStatic to lazily init the short form table.
+// function scope static variable to lazily init the short form table.
struct X86ShortFormTable {
// Stores relaxation table entries sorted by relaxed form opcode.
SmallVector<X86InstrRelaxTableEntry, 0> Table;
@@ -137,10 +138,9 @@ struct X86ShortFormTable {
};
} // namespace
-static ManagedStatic<X86ShortFormTable> ShortTable;
-
const X86InstrRelaxTableEntry *llvm::lookupShortTable(unsigned RelaxOp) {
- auto &Table = ShortTable->Table;
+ static X86ShortFormTable ShortTable;
+ auto &Table = ShortTable.Table;
auto I = llvm::lower_bound(Table, RelaxOp);
if (I != Table.end() && I->KeyOp == RelaxOp)
return &*I;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 49660883ad83..4c962de16530 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -37,6 +37,7 @@ using namespace llvm;
#define GET_INSTRINFO_MC_DESC
#define GET_INSTRINFO_MC_HELPERS
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "X86GenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 7344900f2e31..0ac916527495 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -132,6 +132,9 @@ FunctionPass *createX86EvexToVexInsts();
/// This pass creates the thunks for the retpoline feature.
FunctionPass *createX86IndirectThunksPass();
+/// This pass replaces ret instructions with jmps to __x86_return_thunk.
+FunctionPass *createX86ReturnThunksPass();
+
/// This pass ensures instructions featuring a memory operand
/// have distinctive <LineNumber, Discriminator> (with respect to each other)
FunctionPass *createX86DiscriminateMemOpsPass();
@@ -185,6 +188,7 @@ void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
void initializeX86PreAMXConfigPassPass(PassRegistry &);
void initializeX86LowerTileCopyPass(PassRegistry &);
void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);
+void initializeX86ReturnThunksPass(PassRegistry &);
namespace X86AS {
enum : unsigned {
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index a5c6b40c493c..a859176220c7 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -266,6 +266,8 @@ def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true",
"Write Back No Invalidate">;
def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
"Support RDPID instructions">;
+def FeatureRDPRU : SubtargetFeature<"rdpru", "HasRDPRU", "true",
+ "Support RDPRU instructions">;
def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
"Wait and pause enhancements">;
def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
@@ -1238,6 +1240,7 @@ def ProcessorFeatures {
TuningInsertVZEROUPPER];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
+ FeatureRDPRU,
FeatureWBNOINVD];
list<SubtargetFeature> ZN2Tuning = ZNTuning;
list<SubtargetFeature> ZN2Features =
diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86EvexToVex.cpp
index c7a013a0b17a..cff95d17c14c 100644
--- a/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
+#include <atomic>
#include <cassert>
#include <cstdint>
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 61c1fd25031d..12af6087cb47 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -594,7 +594,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Half type will be promoted by default.
setOperationAction(ISD::FABS, MVT::f16, Promote);
setOperationAction(ISD::FNEG, MVT::f16, Promote);
- setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
@@ -629,6 +629,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, LibCall);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
@@ -2817,6 +2845,21 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
AddressSpace = X86AS::FS;
else if (GuardReg == "gs")
AddressSpace = X86AS::GS;
+
+ // Use the guard symbol if the user specified one.
+ StringRef GuardSymb = M->getStackProtectorGuardSymbol();
+ if (!GuardSymb.empty()) {
+ GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
+ if (!GV) {
+ Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
+ : Type::getInt32Ty(M->getContext());
+ GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
+ nullptr, GuardSymb, nullptr,
+ GlobalValue::NotThreadLocal, AddressSpace);
+ }
+ return GV;
+ }
+
return SegmentOffset(IRB, Offset, AddressSpace);
}
}
@@ -11757,15 +11800,17 @@ static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
-/// both.
+/// both, or via a known bits test.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask,
+ const SelectionDAG &DAG,
SDValue V1 = SDValue(),
SDValue V2 = SDValue()) {
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
- assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
+ assert(llvm::all_of(ExpectedMask,
+ [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
"Illegal target shuffle mask");
// Check for out-of-range target shuffle mask indices.
@@ -11778,12 +11823,28 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
V2 = SDValue();
+ APInt ZeroV1 = APInt::getNullValue(Size);
+ APInt ZeroV2 = APInt::getNullValue(Size);
+
for (int i = 0; i < Size; ++i) {
int MaskIdx = Mask[i];
int ExpectedIdx = ExpectedMask[i];
if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
continue;
- if (0 <= MaskIdx && 0 <= ExpectedIdx) {
+ if (MaskIdx == SM_SentinelZero) {
+ // If we need this expected index to be a zero element, then update the
+ // relevant zero mask and perform the known bits at the end to minimize
+ // repeated computes.
+ SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
+ if (ExpectedV &&
+ Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
+ int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
+ APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
+ ZeroMask.setBit(BitIdx);
+ continue;
+ }
+ }
+ if (MaskIdx >= 0) {
SDValue MaskV = MaskIdx < Size ? V1 : V2;
SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
@@ -11791,15 +11852,16 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
continue;
}
- // TODO - handle SM_Sentinel equivalences.
return false;
}
- return true;
+ return (ZeroV1.isNullValue() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
+ (ZeroV2.isNullValue() || DAG.MaskedVectorIsZero(V2, ZeroV2));
}
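The new SM_SentinelZero handling defers the "is this element provably zero?" question: instead of one known-bits walk per mask element, it records the required-zero lanes in ZeroV1/ZeroV2 and issues at most one MaskedVectorIsZero query per operand at the end. A toy restatement of that bookkeeping for a 4-element mask where lanes 1 and 3 of V1 must be zero (APInt spellings match the ones used above):

  APInt ZeroV1 = APInt::getNullValue(4);
  ZeroV1.setBit(1);
  ZeroV1.setBit(3);
  // One batched query instead of two per-element known-bits computations:
  bool Ok = ZeroV1.isNullValue() || DAG.MaskedVectorIsZero(V1, ZeroV1);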
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
-static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
+ const SelectionDAG &DAG) {
if (VT != MVT::v8i32 && VT != MVT::v8f32)
return false;
@@ -11809,12 +11871,13 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
SmallVector<int, 8> Unpckhwd;
createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
/* Unary = */ false);
- bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
- isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
+ bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
+ isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
return IsUnpackwdMask;
}
-static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
+static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
+ const SelectionDAG &DAG) {
// Create 128-bit vector type based on mask size.
MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
MVT VT = MVT::getVectorVT(EltVT, Mask.size());
@@ -11827,8 +11890,8 @@ static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
for (unsigned i = 0; i != 4; ++i) {
SmallVector<int, 16> UnpackMask;
createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
- if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
- isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
+ if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
+ isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
return true;
}
return false;
@@ -12021,7 +12084,7 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
// Attempt to match the target mask against the unpack lo/hi mask patterns.
SmallVector<int, 64> Unpckl, Unpckh;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
- if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
(IsUnary ? V1 : V2))) {
UnpackOpcode = X86ISD::UNPCKL;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
@@ -12030,7 +12093,7 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
}
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
- if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
(IsUnary ? V1 : V2))) {
UnpackOpcode = X86ISD::UNPCKH;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
@@ -12069,14 +12132,14 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
// If a binary shuffle, commute and try again.
if (!IsUnary) {
ShuffleVectorSDNode::commuteMask(Unpckl);
- if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
UnpackOpcode = X86ISD::UNPCKL;
std::swap(V1, V2);
return true;
}
ShuffleVectorSDNode::commuteMask(Unpckh);
- if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
UnpackOpcode = X86ISD::UNPCKH;
std::swap(V1, V2);
return true;
@@ -12464,14 +12527,14 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
// Try binary shuffle.
SmallVector<int, 32> BinaryMask;
createPackShuffleMask(VT, BinaryMask, false, NumStages);
- if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
+ if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
if (MatchPACK(V1, V2, PackVT))
return true;
// Try unary shuffle.
SmallVector<int, 32> UnaryMask;
createPackShuffleMask(VT, UnaryMask, true, NumStages);
- if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
+ if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
if (MatchPACK(V1, V1, PackVT))
return true;
}
@@ -14283,7 +14346,7 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
// and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
// because that avoids a constant load from memory.
if (NumElts == 4 &&
- (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
+ (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
return SDValue();
// Extend the shuffle mask with undef elements.
@@ -17230,7 +17293,7 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
if (Subtarget.hasAVX2()) {
// extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
- !is128BitUnpackShuffleMask(HalfMask) &&
+ !is128BitUnpackShuffleMask(HalfMask, DAG) &&
(!isSingleSHUFPSMask(HalfMask) ||
Subtarget.hasFastVariableCrossLaneShuffle()))
return SDValue();
@@ -17892,7 +17955,7 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
// since after split we get a more efficient code using vpunpcklwd and
// vpunpckhwd instrs than vblend.
- if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+ if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
DAG);
@@ -17930,7 +17993,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
// since after split we get a more efficient code than vblend by using
// vpunpcklwd and vpunpckhwd instrs.
- if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
+ if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
!Subtarget.hasAVX512())
return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
DAG);
@@ -27887,11 +27950,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
}
// Read Performance Monitoring Counters.
case RDPMC:
+ // Read Processor Register.
+ case RDPRU:
// GetExtended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
// RDPMC uses ECX to select the index of the performance counter to read.
+ // RDPRU uses ECX to select the processor register to read.
// XGETBV uses ECX to select the index of the XCR register to return.
// The result is stored into registers EDX:EAX.
expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
@@ -29902,14 +29968,12 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{4, 5, 6, 7, -1, -1, -1, -1});
- Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
- {0, 1, 1, 1, -1, -1, -1, -1});
- Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
- {2, 3, 3, 3, -1, -1, -1, -1});
- Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
- {0, 1, 1, 1, -1, -1, -1, -1});
- Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
- {2, 3, 3, 3, -1, -1, -1, -1});
+ SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
+ SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
+ Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
+ Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
+ Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
+ Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
}
}
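Switching from getVectorShuffle to explicit X86ISD::PSHUFLW nodes lets the two low-half masks be shared as plain 8-bit immediates. For reference, the v4 shuffle immediate packs 2 bits per output lane, so {0, 1, 1, 1} encodes as 0x54 and {2, 3, 3, 3} as 0xFE. A small sketch of that encoding (the helper name is illustrative):

  // Lane i of the result selects source element (Imm >> (2*i)) & 0x3.
  static unsigned encodeV4ShuffleImm(const int Mask[4]) {
    unsigned Imm = 0;
    for (int i = 0; i < 4; ++i)
      Imm |= (unsigned(Mask[i]) & 0x3) << (2 * i);
    return Imm;                        // {0,1,1,1} -> 0x54, {2,3,3,3} -> 0xFE
  }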
@@ -30797,6 +30861,8 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::UMin:
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
+ case AtomicRMWInst::FMax:
+ case AtomicRMWInst::FMin:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
return AtomicExpansionKind::CmpXChg;
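With FMax/FMin added to this switch, a floating-point atomicrmw max/min on x86 is expanded by AtomicExpandPass into a compare-exchange loop, just like FAdd/FSub. A hedged IRBuilder sketch of creating such an operation (assuming the usual LLVM 15-era CreateAtomicRMW signature; Builder, Ptr, and Val are placeholders):

  // Ptr points at a float, Val is a float; the ordering is chosen for
  // illustration only.
  Value *Old = Builder.CreateAtomicRMW(AtomicRMWInst::FMax, Ptr, Val,
                                       MaybeAlign(4),
                                       AtomicOrdering::SequentiallyConsistent);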
@@ -32894,6 +32960,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
Results);
return;
+ case Intrinsic::x86_rdpru:
+ expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
+ Results);
+ return;
case Intrinsic::x86_xgetbv:
expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
Results);
@@ -36985,8 +37055,9 @@ static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
- SDValue V1, const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
+ SDValue V1, const SelectionDAG &DAG,
+ const X86Subtarget &Subtarget, unsigned &Shuffle,
+ MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
@@ -37057,17 +37128,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
// instructions are no slower than UNPCKLPD but has the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
@@ -37076,17 +37147,19 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v4f64;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
+ V1)) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
+ V1)) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
@@ -37096,21 +37169,22 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
+ V1)) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v8f64;
return true;
}
if (isTargetShuffleEquivalent(
MaskVT, Mask,
- {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
+ {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
if (isTargetShuffleEquivalent(
MaskVT, Mask,
- {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
+ {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
@@ -37126,6 +37200,7 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
const APInt &Zeroable,
bool AllowFloatDomain, bool AllowIntDomain,
+ const SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
@@ -37269,33 +37344,36 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
+ AllowFloatDomain) {
V2 = V1;
V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
+ AllowFloatDomain) {
V2 = V1;
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
SrcVT = DstVT = MVT::v2f64;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
SrcVT = DstVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) &&
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
+ DAG) &&
Subtarget.hasFP16()) {
Shuffle = X86ISD::MOVSH;
SrcVT = DstVT = MVT::v8f16;
@@ -37678,7 +37756,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
scaleShuffleElements(Mask, NumElts, ScaledMask)) {
for (unsigned i = 0; i != NumElts; ++i)
IdentityMask.push_back(i);
- if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
+ if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
+ V2))
return CanonicalizeShuffleInput(RootVT, V1);
}
}
@@ -37902,7 +37981,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
- Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
+ DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
@@ -37913,7 +37992,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
- AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
+ AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
@@ -37931,7 +38010,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// TODO: Handle other insertions here as well?
if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
Subtarget.hasSSE41() &&
- !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
+ !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
if (MaskEltSizeInBits == 32) {
SDValue SrcV1 = V1, SrcV2 = V2;
if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
@@ -37947,12 +38026,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
if (MaskEltSizeInBits == 64 &&
- isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
+ isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
V2.getScalarValueSizeInBits() <= 32) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
return SDValue(); // Nothing to do!
- PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
+ PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
CanonicalizeShuffleInput(MVT::v4f32, V1),
CanonicalizeShuffleInput(MVT::v4f32, V2),
@@ -51654,9 +51733,13 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
- if ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && Subtarget.hasAVX()) ||
- (OpSize == 512 && Subtarget.useAVX512Regs())) {
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX()) ||
+ (OpSize == 512 && Subtarget.useAVX512Regs()))) {
bool HasPT = Subtarget.hasSSE41();
// PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index a55b95960aa6..6124755ca539 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1532,44 +1532,6 @@ def : Pat<(xor GR32:$src1, -2147483648),
}
//===----------------------------------------------------------------------===//
-// Pattern match SUB as XOR
-//===----------------------------------------------------------------------===//
-
-// An immediate in the LHS of a subtract can't be encoded in the instruction.
-// If there is no possibility of a borrow we can use an XOR instead of a SUB
-// to enable the immediate to be folded.
-// TODO: Move this to a DAG combine?
-
-def sub_is_xor : PatFrag<(ops node:$lhs, node:$rhs), (sub node:$lhs, node:$rhs),[{
- if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
- KnownBits Known = CurDAG->computeKnownBits(N->getOperand(1));
-
- // If all possible ones in the RHS are set in the LHS then there can't be
- // a borrow and we can use xor.
- return (~Known.Zero).isSubsetOf(CN->getAPIntValue());
- }
-
- return false;
-}]>;
-
-let AddedComplexity = 5 in {
-def : Pat<(sub_is_xor imm:$src2, GR8:$src1),
- (XOR8ri GR8:$src1, imm:$src2)>;
-def : Pat<(sub_is_xor i16immSExt8:$src2, GR16:$src1),
- (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(sub_is_xor imm:$src2, GR16:$src1),
- (XOR16ri GR16:$src1, imm:$src2)>;
-def : Pat<(sub_is_xor i32immSExt8:$src2, GR32:$src1),
- (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
-def : Pat<(sub_is_xor imm:$src2, GR32:$src1),
- (XOR32ri GR32:$src1, imm:$src2)>;
-def : Pat<(sub_is_xor i64immSExt8:$src2, GR64:$src1),
- (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(sub_is_xor i64immSExt32:$src2, GR64:$src1),
- (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
-}
-
-//===----------------------------------------------------------------------===//
// Some peepholes
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 52b2a62316cd..c4317be664fd 100644
--- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -13,8 +13,8 @@
#include "X86InstrFMA3Info.h"
#include "X86InstrInfo.h"
-#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Threading.h"
+#include <atomic>
#include <cassert>
#include <cstdint>
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 27220a8d4d99..8aeb169929f2 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -13,6 +13,7 @@
#include "X86InstrFoldTables.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include <atomic>
#include <vector>
using namespace llvm;
@@ -6102,7 +6103,7 @@ llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
namespace {
// This class stores the memory unfolding tables. It is instantiated as a
-// ManagedStatic to lazily init the unfolding table.
+// function scope static variable to lazily init the unfolding table.
struct X86MemUnfoldTable {
// Stores memory unfolding tables entries sorted by opcode.
std::vector<X86MemoryFoldTableEntry> Table;
@@ -6159,11 +6160,10 @@ struct X86MemUnfoldTable {
};
}
-static ManagedStatic<X86MemUnfoldTable> MemUnfoldTable;
-
const X86MemoryFoldTableEntry *
llvm::lookupUnfoldTable(unsigned MemOp) {
- auto &Table = MemUnfoldTable->Table;
+ static X86MemUnfoldTable MemUnfoldTable;
+ auto &Table = MemUnfoldTable.Table;
auto I = llvm::lower_bound(Table, MemOp);
if (I != Table.end() && I->KeyOp == MemOp)
return &*I;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 7f6ef3479d40..4a9a281d5b99 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -978,6 +978,7 @@ def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">;
def HasRDPID : Predicate<"Subtarget->hasRDPID()">;
+def HasRDPRU : Predicate<"Subtarget->hasRDPRU()">;
def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">;
def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">;
def HasCX8 : Predicate<"Subtarget->hasCX8()">;
diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td
index 3a653a56e534..b1ca87279007 100644
--- a/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/llvm/lib/Target/X86/X86InstrSystem.td
@@ -735,6 +735,15 @@ def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
} // SchedRW
//===----------------------------------------------------------------------===//
+// RDPRU - Read Processor Register instruction.
+
+let SchedRW = [WriteSystem] in {
+let Uses = [ECX], Defs = [EAX, EDX] in
+ def RDPRU : I<0x01, MRM_FD, (outs), (ins), "rdpru", []>, PS,
+ Requires<[HasRDPRU]>;
+}
+
+//===----------------------------------------------------------------------===//
// Platform Configuration instruction
// From ISA docs:
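As the Uses = [ECX], Defs = [EAX, EDX] constraints above encode, RDPRU reads the processor register selected by ECX into EDX:EAX. A hedged user-level sketch via inline asm (this assumes an assembler that already recognizes the mnemonic and is not taken from this patch):

  static inline unsigned long long rdpru(unsigned int Idx) {
    unsigned int Lo, Hi;
    __asm__ __volatile__("rdpru" : "=a"(Lo), "=d"(Hi) : "c"(Idx));
    return ((unsigned long long)Hi << 32) | Lo;
  }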
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 3c8be95b43e3..6112c0b7d6c3 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -37,7 +37,7 @@ enum IntrinsicType : uint16_t {
TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2DQ_MASK, CVTQQ2PS_MASK,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
FIXUPIMM, FIXUPIMM_MASKZ, GATHER_AVX2,
- ROUNDP, ROUNDS
+ ROUNDP, ROUNDS, RDPRU
};
struct IntrinsicData {
@@ -309,6 +309,7 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0),
X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0),
+ X86_INTRINSIC_DATA(rdpru, RDPRU, X86::RDPRU, 0),
X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
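
The RDPRU changes above add the subtarget predicate, the instruction definition (implicitly reading ECX and writing EAX/EDX), and the chained-intrinsic table entry. As a rough illustration of the ISA behavior being modeled, and not code from this patch, the instruction can be exercised from C++ with GCC/Clang inline assembly; the raw encoding (0F 01 FD) and register usage below follow the TableGen definition, and the CPUID guard is left to the caller.

#include <cstdio>

// Hedged sketch: issue RDPRU directly. The register index goes in ECX
// (0 selects MPERF, 1 selects APERF) and the 64-bit result comes back in
// EDX:EAX. A .byte encoding is used in case the assembler predates the
// mnemonic.
static inline unsigned long long rdpru(unsigned Reg) {
  unsigned Lo, Hi;
  __asm__ __volatile__(".byte 0x0f, 0x01, 0xfd" // rdpru
                       : "=a"(Lo), "=d"(Hi)
                       : "c"(Reg));
  return ((unsigned long long)Hi << 32) | Lo;
}

int main() {
  // Only meaningful on CPUs that advertise RDPRU support (recent AMD parts).
  std::printf("MPERF: %llu\n", rdpru(0));
}
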
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index b107de692365..3fbdb18a0793 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -2413,6 +2413,10 @@ static void addConstantComments(const MachineInstr *MI,
}
void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
+ // FIXME: Enable feature predicate checks once all the tests pass.
+ // X86_MC::verifyInstructionPredicates(MI->getOpcode(),
+ // Subtarget->getFeatureBits());
+
X86MCInstLower MCInstLowering(*MF, *this);
const X86RegisterInfo *RI =
MF->getSubtarget<X86Subtarget>().getRegisterInfo();
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index 7761f7323358..c760a32e2579 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -439,8 +439,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
while (!Worklist.empty()) {
Value *V = Worklist.pop_back_val();
- if (!Visited.insert(V).second)
- continue;
+ if (!Visited.insert(V).second)
+ continue;
if (auto *PN = dyn_cast<PHINode>(V)) {
// PHI node should have single use unless it is the root node, then it
@@ -466,7 +466,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
// gets us back to this node.
if (BO->hasNUses(BO == Root ? 3 : 2)) {
PHINode *PN = nullptr;
- for (auto *U : Root->users())
+ for (auto *U : BO->users())
if (auto *P = dyn_cast<PHINode>(U))
if (!Visited.count(P))
PN = P;
diff --git a/llvm/lib/Target/X86/X86ReturnThunks.cpp b/llvm/lib/Target/X86/X86ReturnThunks.cpp
new file mode 100644
index 000000000000..4b203229ba83
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ReturnThunks.cpp
@@ -0,0 +1,92 @@
+//===-- X86ReturnThunks.cpp - Replace rets with thunks or inline thunks --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Pass that replaces ret instructions with a jmp to __x86_return_thunk.
+///
+/// This corresponds to -mfunction-return=thunk-extern or
+/// __attribute__((function_return("thunk-extern"))).
+///
+/// This pass is a minimal implementation necessary to help mitigate
+/// RetBleed for the Linux kernel.
+///
+/// Should support for thunk or thunk-inline be necessary in the future, this
+/// pass should be combined with x86-retpoline-thunks, which already has the
+/// machinery to emit thunks. Until then, YAGNI.
+///
+/// This pass is very similar to x86-lvi-ret.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define PASS_KEY "x86-return-thunks"
+#define DEBUG_TYPE PASS_KEY
+
+struct X86ReturnThunks final : public MachineFunctionPass {
+ static char ID;
+ X86ReturnThunks() : MachineFunctionPass(ID) {}
+ StringRef getPassName() const override { return "X86 Return Thunks"; }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+char X86ReturnThunks::ID = 0;
+
+bool X86ReturnThunks::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << getPassName() << "\n");
+
+ bool Modified = false;
+
+ if (!MF.getFunction().hasFnAttribute(llvm::Attribute::FnRetThunkExtern))
+ return Modified;
+
+ StringRef ThunkName = "__x86_return_thunk";
+ if (MF.getFunction().getName() == ThunkName)
+ return Modified;
+
+ const auto &ST = MF.getSubtarget<X86Subtarget>();
+ const bool Is64Bit = ST.getTargetTriple().getArch() == Triple::x86_64;
+ const unsigned RetOpc = Is64Bit ? X86::RET64 : X86::RET32;
+ SmallVector<MachineInstr *, 16> Rets;
+
+ for (MachineBasicBlock &MBB : MF)
+ for (MachineInstr &Term : MBB.terminators())
+ if (Term.getOpcode() == RetOpc)
+ Rets.push_back(&Term);
+
+ const MCInstrDesc &JMP = ST.getInstrInfo()->get(X86::TAILJMPd);
+
+ for (MachineInstr *Ret : Rets) {
+ BuildMI(Ret->getParent(), Ret->getDebugLoc(), JMP)
+ .addExternalSymbol(ThunkName.data());
+ Ret->eraseFromParent();
+ Modified = true;
+ }
+
+ return Modified;
+}
+
+INITIALIZE_PASS(X86ReturnThunks, PASS_KEY, "X86 Return Thunks", false, false)
+
+FunctionPass *llvm::createX86ReturnThunksPass() {
+ return new X86ReturnThunks();
+}
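
For reference, a hedged source-level example of what triggers the new pass: the attribute spelling is the one quoted in the pass's file comment, and the jump target is the externally provided __x86_return_thunk symbol the pass substitutes for each return. This is illustration only, not part of the patch.

// Compile for x86 with a toolchain that supports the function_return
// attribute (or pass -mfunction-return=thunk-extern for the whole TU).
__attribute__((function_return("thunk-extern")))
int leaf(int x) {
  return x + 1; // codegen ends in "jmp __x86_return_thunk" instead of "ret"
}
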
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 4249788e3540..f4e25e4194db 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -100,6 +100,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86OptimizeLEAPassPass(PR);
initializeX86PartialReductionPass(PR);
initializePseudoProbeInserterPass(PR);
+ initializeX86ReturnThunksPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -575,6 +576,7 @@ void X86PassConfig::addPreEmitPass2() {
// hand inspection of the codegen output.
addPass(createX86SpeculativeExecutionSideEffectSuppression());
addPass(createX86IndirectThunksPass());
+ addPass(createX86ReturnThunksPass());
// Insert extra int3 instructions after trailing call instructions to avoid
// issues in the unwinder.
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index c286b747a271..a782ff436dc0 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -29,6 +29,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "XCoreGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
index 096b22415a22..ec4418333859 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -22,6 +22,7 @@
// Defines symbolic names for the XCore instructions.
//
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "XCoreGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
index 8fea61d125d2..691fdf16bc0f 100644
--- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -256,6 +256,9 @@ bool XCoreAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
}
void XCoreAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ XCore_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
SmallString<128> Str;
raw_svector_ostream O(Str);